In [1]:
import os

import findspark

# findspark must put the Spark installation on sys.path before pyspark is imported.
# Prefer SPARK_HOME when it is set so the notebook runs on other machines;
# otherwise fall back to the original local installation path.
findspark.init(os.environ.get('SPARK_HOME', '/home/spark/spark-2.1.0-bin-hadoop2.7'))

import pyspark
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's Spark session under the application name 'NlpVec'.
spark = (
    SparkSession.builder
    .appName('NlpVec')
    .getOrCreate()
)

In [17]:
# Feature transformers used for the TF-IDF pipeline below.
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [6]:
# Three toy sentences, each paired with a float value stored in the "id" column.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["id", "sentence"],
)

In [7]:
# Preview the raw sentences (explicit defaults: up to 20 rows, long cells truncated).
sentenceDataFrame.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|0.0|Hi I heard about ...|
|0.0|I wish Java could...|
|1.0|Logistic regressi...|
+---+--------------------+



In [8]:
tockenizer = Tokenizer(inputCol = 'sentence', outputCol = 'words')

In [10]:
words_data = tockenizer.transform(sentenceDataFrame)

In [12]:
# Print the tokenized rows in full (up to the default 20 rows, no cell truncation).
words_data.show(n=20, truncate=False)

+---+-----------------------------------+------------------------------------------+
|id |sentence                           |words                                     |
+---+-----------------------------------+------------------------------------------+
|0.0|Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|0.0|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0|Logistic regression models are neat|[logistic, regression, models, are, neat] |
+---+-----------------------------------+------------------------------------------+



In [13]:
# Term-frequency stage: reads the 'words' column and writes 'rawFeatures'.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)

In [14]:
# Apply the hashing term-frequency stage to the tokenized sentences.
featurized_data = hashing_tf.transform(dataset=words_data)

In [15]:
# Preview the DataFrame with the added 'rawFeatures' column (explicit defaults).
featurized_data.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|
|0.0|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|
|1.0|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|
+---+--------------------+--------------------+--------------------+



In [19]:
idf = IDF(inputCol ='rawFeatures', outputCol = 'feaures')

In [20]:
idf_model = idf.fit(featurized_data)

In [21]:
rescaled_data = idf_model.transform(featurized_data)

In [30]:
rescaled_data.show()
rescaled_data.select('id','feaures').show(truncate=False)


+---+--------------------+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|             feaures|
+---+--------------------+--------------------+--------------------+--------------------+
|0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|(262144,[24417,49...|
|0.0|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|(262144,[20719,24...|
|1.0|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|(262144,[13671,91...|
+---+--------------------+--------------------+--------------------+--------------------+

+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |feaures                                                                                                                                                              

In [31]:
from pyspark.ml.feature import CountVectorizer



In [32]:
# Two pre-tokenized documents built from the tokens a, b and c.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)


In [33]:
# Preview the two token lists (explicit defaults: up to 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [36]:
# Count-based vectorizer over 'words', writing 'features'; vocabulary capped at
# 3 terms, with minDF=2.0 as the minimum document-frequency threshold.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)

In [37]:
# Fit the CountVectorizer on the two-document DataFrame.
model = cv.fit(dataset=df)

In [39]:
# Apply the fitted CountVectorizer model to the same documents.
result = model.transform(dataset=df)

In [41]:
# Print the count vectors in full (up to the default 20 rows, no cell truncation).
result.show(n=20, truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

