In [13]:
import pandas as pd
import numpy as np

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

from pyspark.ml.feature import Tokenizer, HashingTF, IDF

In [2]:
# setup spark
# Reuse the active session when one already exists so that re-running this cell
# (or running it in an environment that pre-creates Spark) does not fail: a bare
# SparkContext() raises ValueError if another context is already running.
ss = SparkSession.builder.getOrCreate()
# Keep `sc` available for any code that expects the underlying SparkContext.
sc = ss.sparkContext

In [4]:
# define some text data
# Each row is (id, text); the toy corpus has three short documents.
# NOTE(review): the last two rows share id 2 — possibly a typo for 3; confirm.
corpus_rows = [
    (1, "Andrew is one hell of a cool dude."),
    (2, "Andrew loves Resham very very much."),
    (2, "Resham also loves Andrew."),
]
data_df = ss.createDataFrame(corpus_rows, schema=['id', 'text'])
data_df.show()

+---+--------------------+
| id|                text|
+---+--------------------+
|  1|Andrew is one hel...|
|  2|Andrew loves Resh...|
|  2|Resham also loves...|
+---+--------------------+



In [6]:
# tokenize
# Adds a 'words' column holding the list of tokens for each row's 'text'
# (the displayed result shows the tokens lowercased).
toke = Tokenizer(
    inputCol='text',
    outputCol='words',
)
take_a_toke = toke.transform(data_df)
take_a_toke.show()

+---+--------------------+--------------------+
| id|                text|               words|
+---+--------------------+--------------------+
|  1|Andrew is one hel...|[andrew, is, one,...|
|  2|Andrew loves Resh...|[andrew, loves, r...|
|  2|Resham also loves...|[resham, also, lo...|
+---+--------------------+--------------------+



In [26]:
# now hash TF & IDF
# Term frequencies: every token is hashed into one of 16 buckets, producing a
# length-16 sparse vector per document in 'rawfeatures'.
cornedbeef = HashingTF(numFeatures=16, inputCol='words', outputCol='rawfeatures')
TF = cornedbeef.transform(take_a_toke)

# Inverse document frequency: fit on the term-frequency vectors, then use the
# fitted model to rescale them into the 'IDFFeatures' column.
IDFtrans = IDF(inputCol='rawfeatures', outputCol='IDFFeatures')
idf_model = IDFtrans.fit(TF)
IDFdata = idf_model.transform(TF)

In [27]:
# Print the DataFrame with the raw term-frequency and TF-IDF vector columns
# (long cell values are truncated in this default view).
IDFdata.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|                text|               words|         rawfeatures|         IDFFeatures|
+---+--------------------+--------------------+--------------------+--------------------+
|  1|Andrew is one hel...|[andrew, is, one,...|(16,[1,3,5,9,15],...|(16,[1,3,5,9,15],...|
|  2|Andrew loves Resh...|[andrew, loves, r...|(16,[1,2,6,8,9],[...|(16,[1,2,6,8,9],[...|
|  2|Resham also loves...|[resham, also, lo...|(16,[0,1,6,8],[1....|(16,[0,1,6,8],[0....|
+---+--------------------+--------------------+--------------------+--------------------+



In [28]:
# Bring only the identifying columns and the TF-IDF vectors over to pandas
# so they can be inspected with the notebook's rich display.
idf_columns = ['id', 'text', 'IDFFeatures']
idf_selected = IDFdata.select(idf_columns)
idfpd = idf_selected.toPandas()
display(idfpd)

Unnamed: 0,id,text,IDFFeatures
0,1,Andrew is one hell of a cool dude.,"(0.0, 0.0, 0.0, 1.3862943611198906, 0.0, 0.693..."
1,2,Andrew loves Resham very very much.,"(0.0, 0.0, 0.6931471805599453, 0.0, 0.0, 0.0, ..."
2,2,Resham also loves Andrew.,"(0.6931471805599453, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [29]:
# Show the per-document TF-IDF vectors as a NumPy object array of SparseVectors.
idfpd['IDFFeatures'].values

array([SparseVector(16, {1: 0.0, 3: 1.3863, 5: 0.6931, 9: 0.5754, 15: 1.3863}),
       SparseVector(16, {1: 0.0, 2: 0.6931, 6: 0.2877, 8: 0.5754, 9: 0.2877}),
       SparseVector(16, {0: 0.6931, 1: 0.0, 6: 0.2877, 8: 0.2877})],
      dtype=object)

## Open question: how should these TF-IDF sparse vectors (hashed bucket index -> weight) be interpreted?