In [1]:
# findspark.init() has to run before the pyspark imports below so that the
# local Spark installation is importable from this kernel.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('SparkSess')
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import Tokenizer

In [4]:
# Toy corpus: three (id, sentence) rows that every later cell builds on.
# Written as plain Python (no pasted "... " continuation prompts) so the cell
# also runs outside IPython, e.g. after exporting the notebook to a script.
sentences_df = spark.createDataFrame(
    [
        (1, "This is an introduction to SPark MLlib"),
        (2, "MLlib includes libraries for classification and regression"),
        (3, "It also contains supporting tools for pipelines"),
    ],
    ["id", "sentence"],  # column names, in tuple order
)

In [5]:
# Preview the corpus; show() shortens long strings, hence the trailing "..."
# in the sentence column.
sentences_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  1|This is an introd...|
|  2|MLlib includes li...|
|  3|It also contains ...|
+---+--------------------+



In [6]:
# Tokenizer reads the "sentence" column and writes its tokens to "words".
sent_token = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [8]:
# transform() returns a new DataFrame with the extra "words" column;
# sentences_df itself keeps only its original two columns.
sent_tokenized_df = sent_token.transform(sentences_df)

In [9]:
# The "words" column holds the lower-cased tokens of each sentence
# (e.g. "This" -> "this", "MLlib" -> "mllib").
sent_tokenized_df.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|
|  2|MLlib includes li...|[mllib, includes,...|
|  3|It also contains ...|[it, also, contai...|
+---+--------------------+--------------------+



## TF-IDF
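### Term Frequency–Inverse Document Frequency

The cells below hash each token list into a fixed-length term-frequency vector (`HashingTF`) and then rescale those counts with inverse document frequency weights learned from the corpus (`IDF`).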

In [10]:
from pyspark.ml.feature import HashingTF, IDF

In [11]:
# A bare DataFrame expression displays only the schema (column names and
# types), not the rows.
sentences_df

DataFrame[id: bigint, sentence: string]

In [12]:
# take(1) returns a list holding the first Row, with the sentence untruncated.
sentences_df.take(1)

[Row(id=1, sentence='This is an introduction to SPark MLlib')]

In [13]:
# First Row again, now including the complete token list in "words".
sent_tokenized_df.take(1)

[Row(id=1, sentence='This is an introduction to SPark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'])]

In [14]:
# Hash each token list in "words" into a fixed-length term-frequency vector
# stored in "rawFeatures". 20 buckets keep the vectors short enough to read,
# but distinct tokens can then land in the same bucket.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)

In [15]:
# Append the hashed term-frequency vectors as the "rawFeatures" column.
sent_hfTF_df = hashingTF.transform(sent_tokenized_df)

In [16]:
# "rawFeatures" prints as a sparse vector: (size, [bucket indices], ...),
# cut short by show()'s truncation.
sent_hfTF_df.show()

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|(20,[6,8,9,10,13,...|
|  2|MLlib includes li...|[mllib, includes,...|(20,[2,4,11,12,15...|
|  3|It also contains ...|[it, also, contai...|(20,[1,4,6,8,11,1...|
+---+--------------------+--------------------+--------------------+



In [17]:
# Untruncated view of the first row: its seven tokens occupy six buckets,
# with bucket 6 counting two of them (a hash collision).
sent_hfTF_df.take(1)

[Row(id=1, sentence='This is an introduction to SPark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'], rawFeatures=SparseVector(20, {6: 2.0, 8: 1.0, 9: 1.0, 10: 1.0, 13: 1.0, 15: 1.0}))]

In [18]:
# IDF estimator: reads the term-frequency vectors in "rawFeatures" and will
# write the reweighted vectors to "idf_features".
idf = IDF(
    inputCol="rawFeatures",
    outputCol="idf_features",
)

In [19]:
# Fit the IDF estimator on the term-frequency vectors; the resulting model is
# what rescales them in the next cell.
idfModel = idf.fit(sent_hfTF_df)

In [21]:
# Apply the fitted model: adds the "idf_features" column next to "rawFeatures".
tfidf_df = idfModel.transform(sent_hfTF_df)

In [22]:
# Compare "rawFeatures" with "idf_features" for the first row: each raw count
# is rescaled per bucket (e.g. bucket 6: 2.0 -> 0.5754, bucket 9: 1.0 -> 0.6931).
tfidf_df.take(1)

[Row(id=1, sentence='This is an introduction to SPark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'], rawFeatures=SparseVector(20, {6: 2.0, 8: 1.0, 9: 1.0, 10: 1.0, 13: 1.0, 15: 1.0}), idf_features=SparseVector(20, {6: 0.5754, 8: 0.2877, 9: 0.6931, 10: 0.6931, 13: 0.6931, 15: 0.2877}))]