In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 5.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=bea5759a30a25969ab26b92c226f0e05f469ca0365375edf494ab50cce0ee180
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF , IDF , Tokenizer

In [5]:
# Obtain the Spark session (application name "app") that every later cell
# uses as its entry point.
spark = (
    SparkSession.builder
    .appName("app")
    .getOrCreate()
)

In [22]:
# Toy labelled corpus: one (label, text) pair per row.
# NOTE(review): the last sentence is comma-separated with no spaces, so a
# whitespace tokenizer will keep it as a single token — confirm this is intended.
rows = [
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic,regression,models,are,neat"),
]

# Column names, in the same order as the tuple fields above.
schema = ["label", "text"]

# Build the DataFrame from `rows` instead of rebinding the raw list's own name:
# createDataFrame refuses an existing DataFrame as input, so reusing one name
# for both made this step fail when executed a second time.
data = spark.createDataFrame(rows, schema)
data.show()

+-----+--------------------+
|label|                text|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic,regressi...|
+-----+--------------------+



In [24]:
# Tokenizer stage: reads the `text` column and writes the resulting
# token array to a new `words` column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

In [25]:
# Apply the tokenizer; the result keeps the input columns and adds `words`.
tokenized_data = tokenizer.transform(data)

# Print every column at full width so the token arrays are readable.
tokenized_data.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|text                               |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |
+-----+-----------------------------------+------------------------------------------+



In [26]:
# Term-frequency stage: hashes the tokens in `words` into a sparse count
# vector stored in `rawFeatures`.
hashing_tf = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
)

In [27]:
# Compute the raw term-frequency vectors for each tokenized row.
featurized_data = hashing_tf.transform(tokenized_data)

# Preview with the default column truncation.
featurized_data.show()

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[18700,19...|
|  0.0|I wish Java could...|[i, wish, java, c...|(262144,[19036,20...|
|  1.0|Logistic,regressi...|[logistic,regress...|(262144,[11534],[...|
+-----+--------------------+--------------------+--------------------+



In [28]:
# Inverse-document-frequency estimator: consumes `rawFeatures` and will emit
# the re-weighted vectors in `features` once fitted.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [29]:
# Fit the IDF estimator on the term-frequency vectors to obtain a model
# that can rescale them.
idf_model = idf.fit(featurized_data)

In [30]:
# Apply the fitted IDF model; adds the `features` column alongside the
# existing ones.
rescaled_data = idf_model.transform(featurized_data)

In [33]:
# Show only the label and the final TF-IDF vector, untruncated.
rescaled_data.select("label", "features").show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                      |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[18700,19036,33808,66273,173558],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                   |
|0.0  |(262144,[19036,20719,55551,58672,98717,109547,192310],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])|
|1.0 