In [1]:
# Code Snippet 38
# Build TF-IDF style features for the review text: tokenize each review,
# turn the tokens into count vectors, then rescale those counts with IDF.
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TF-IDF_CountVec").getOrCreate()

# Read the reviews CSV, taking column names from the header row and letting
# Spark infer the column types.
data = spark.read.csv(
    "reviews_tf-idf.csv",
    header=True,
    inferSchema=True,
)
print("Initial Data")
data.show(truncate=False)

# Step 1: Tokenizer splits the 'reviews' text on whitespace into a list of
# tokens stored in 'review_tokens'.
simple_tokenizer = Tokenizer(
    inputCol="reviews",
    outputCol="review_tokens",
)
simple_tokens = simple_tokenizer.transform(data)
print("Tokenizer Output - Splitting text on Whitespaces")
simple_tokens.show(truncate=False)

# Step 2: CountVectorizer is fitted on the tokenized reviews and then used to
# convert each token list into a vector of token counts ('countVec').
count_vectors = CountVectorizer(
    inputCol="review_tokens",
    outputCol="countVec",
)
count_vectors_model = count_vectors.fit(simple_tokens)
countVector_featurized_data = count_vectors_model.transform(simple_tokens)
print("CountVectorizer Data")
countVector_featurized_data.select("review_tokens", "countVec").show(
    truncate=False
)

# Step 3: IDF is fitted on the count vectors and applied to them, writing the
# result to the 'features' column.
idf = IDF(
    inputCol="countVec",
    outputCol="features",
)
idf_model = idf.fit(countVector_featurized_data)
final_data = idf_model.transform(countVector_featurized_data)
print("Final Spark accepted Data - NLP Formatted Data ready to pass into any Machine Learning Model")
# Only the label and the final feature vector are displayed; cells longer
# than 60 characters are truncated in the printed table.
final_data.select("label", "features").show(truncate=60)

Initial Data
+---------------------+-----+
|reviews              |label|
+---------------------+-----+
|It was just wonderful|1    |
|not so good          |0    |
|very very negative   |0    |
|super super duper    |1    |
|average quality      |0    |
+---------------------+-----+

Tokenizer Output - Splitting text on Whitespaces
+---------------------+-----+--------------------------+
|reviews              |label|review_tokens             |
+---------------------+-----+--------------------------+
|It was just wonderful|1    |[it, was, just, wonderful]|
|not so good          |0    |[not, so, good]           |
|very very negative   |0    |[very, very, negative]    |
|super super duper    |1    |[super, super, duper]     |
|average quality      |0    |[average, quality]        |
+---------------------+-----+--------------------------+

CountVectorizer Data
+--------------------------+---------------------------------+
|review_tokens             |countVec                         |
+-----