# Hashing trick
The hashing trick works by applying a hash function to the features and using the resulting
hash values directly as vector indices, rather than building a dictionary first.

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{Tokenizer,HashingTF}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.linalg.Vector

## Create a set of documents

In [2]:
// Three short example documents, each paired with an integer id.
val df = Seq(
  (0, "The sun is shining"),
  (1, "The weather is sweet, sweet"),
  (2, "The sun is shining and the weather is sweet")
).toDF("id", "doc")

df = [id: int, doc: string]


[id: int, doc: string]

In [3]:
// Preview the documents. With no arguments, show() abbreviates long strings,
// which is why the longer docs end in "..." in the printed table.
df.show()

+---+--------------------+
| id|                 doc|
+---+--------------------+
|  0|  The sun is shining|
|  1|The weather is sw...|
|  2|The sun is shinin...|
+---+--------------------+



## Vectorize the documents

In [4]:
// Tokenizer stage: reads the "doc" column and writes its output to a new "words" column.
val tokenizer = new Tokenizer().setInputCol("doc").setOutputCol("words")

tokenizer = tok_5683f06fbb75


tok_5683f06fbb75

In [5]:
// HashingTF stage: reads "words" and writes fixed-length (10) count vectors to "features".
// With only 10 buckets, different words can land on the same index.
// NOTE(review): Spark's docs recommend a power of two for numFeatures in real workloads;
// 10 is presumably chosen here to keep the example vectors readable.
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features").setNumFeatures(10)

hashingTF = hashingTF_e98c19007874


hashingTF_e98c19007874

In [6]:
// Chain the tokenizer and the hashing stage so they run as one pipeline, in that order.
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF))

pipeline = pipeline_448e4f6269e6


pipeline_448e4f6269e6

In [7]:
// Fit the pipeline on the documents, apply the fitted model to the same documents,
// and keep only the id, the original text, and the hashed feature vector.
val df_v = {
  val fitted = pipeline.fit(df)
  fitted.transform(df).select("id", "doc", "features")
}
df_v.show()

+---+--------------------+--------------------+
| id|                 doc|            features|
+---+--------------------+--------------------+
|  0|  The sun is shining|(10,[0,1,6],[2.0,...|
|  1|The weather is sw...|(10,[0,1,2,4,7],[...|
|  2|The sun is shinin...|(10,[0,1,2,3,4,6]...|
+---+--------------------+--------------------+



df_v = [id: int, doc: string ... 1 more field]


[id: int, doc: string ... 1 more field]

In [8]:
// Bring the feature vectors back to the driver and expand each sparse vector to its
// dense form so every one of the 10 buckets is visible. collect() materializes all rows
// locally, which is fine for this three-document example.
// Row.getAs reads the column by name with the expected type, replacing the positional
// lookup plus manual asInstanceOf cast.
df_v.select("features").collect().map(_.getAs[Vector]("features").toDense)

[[2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0], [1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0], [3.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0]]