In [1]:
# Code Snippet 37
# Build TF-IDF features from raw review text in three stages:
# Tokenizer -> HashingTF -> IDF.
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('TF-IDF_HashTF')
    .getOrCreate()
)

# Load the reviews CSV; the first row is the header and column types
# are inferred by Spark.
data = spark.read.csv(
    'reviews_tf-idf.csv',
    header=True,
    inferSchema=True,
)
print("Initial Data")
data.show(truncate=False)

# Stage 1 - Tokenizer: turn the 'reviews' string column into a list of
# tokens by splitting on whitespace (the displayed output also shows the
# tokens lower-cased).
simple_tokenizer = Tokenizer(
    inputCol='reviews',
    outputCol='review_tokens',
)
simple_tokens = simple_tokenizer.transform(data)
print("Tokenizer Output - Splitting text on Whitespaces")
simple_tokens.show(truncate=False)

# Stage 2 - HashingTF: map each token list to a term-frequency vector
# using the hashing trick (default settings are used here).
hashingtf_vectors = HashingTF(
    inputCol='review_tokens',
    outputCol='hashVec',
)
HashingTF_featurized_data = hashingtf_vectors.transform(simple_tokens)
print("HashingTF Data")
(
    HashingTF_featurized_data
    .select('review_tokens', 'hashVec')
    .show(truncate=40)  # cap each displayed cell at 40 characters
)

# Stage 3 - IDF: fit on the term-frequency vectors, then rescale them so
# the result lands in a 'features' column.
idf = IDF(
    inputCol='hashVec',
    outputCol='features',
)
idf_model = idf.fit(HashingTF_featurized_data)
final_data = idf_model.transform(HashingTF_featurized_data)
print("Final Spark accepted Data - NLP Formatted Data ready to pass into any Machine Learning Model")
(
    final_data
    .select('label', 'features')
    .show(truncate=60)  # cap each displayed cell at 60 characters
)

Initial Data
+---------------------+-----+
|reviews              |label|
+---------------------+-----+
|It was just wonderful|1    |
|not so good          |0    |
|very very negative   |0    |
|super super duper    |1    |
|average quality      |0    |
+---------------------+-----+

Tokenizer Output - Splitting text on Whitespaces
+---------------------+-----+--------------------------+
|reviews              |label|review_tokens             |
+---------------------+-----+--------------------------+
|It was just wonderful|1    |[it, was, just, wonderful]|
|not so good          |0    |[not, so, good]           |
|very very negative   |0    |[very, very, negative]    |
|super super duper    |1    |[super, super, duper]     |
|average quality      |0    |[average, quality]        |
+---------------------+-----+--------------------------+

HashingTF Data
+--------------------------+----------------------------------------+
|             review_tokens|                                 hashVec