# Stop Words Remover

The stop words remover filters out common words that carry little meaning on their own:
articles such as the, a, and an; auxiliary verbs such as do, be, and will;
and prepositions such as on, around, and beneath.

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.{Tokenizer,StopWordsRemover,CountVectorizer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.linalg.Vector

## Create a set of documents

In [2]:
// Three short example documents, each paired with an integer id.
val df = Seq(
  (0, "The sun is shining"),
  (1, "The weather is sweet, sweet"),
  (2, "The sun is shining and the weather is sweet")
).toDF("id", "doc")

df = [id: int, doc: string]


[id: int, doc: string]

In [3]:
// Print up to 20 rows; long cell values are cut off (hence the "..." in the doc column).
df.show(20, truncate = true)

+---+--------------------+
| id|                 doc|
+---+--------------------+
|  0|  The sun is shining|
|  1|The weather is sw...|
|  2|The sun is shinin...|
+---+--------------------+



## Vectorize the documents

In [4]:
// Splits each document in "doc" into lower-cased tokens stored in "raw_words".
// NOTE: punctuation stays attached to the token it follows, so "sweet," and "sweet"
// end up as two different words further down the pipeline.
val tokenizer = new Tokenizer().setInputCol("doc").setOutputCol("raw_words")

tokenizer = tok_9de3b95c0a54


tok_9de3b95c0a54

In [5]:
// Drops stop words (using the remover's default word list) from "raw_words"
// and writes the remaining tokens to "filtered_words".
val remover = new StopWordsRemover().setInputCol("raw_words").setOutputCol("filtered_words")

remover = stopWords_d5dde8f20b91


stopWords_d5dde8f20b91

In [6]:
// Turns the filtered tokens into a term-count vector per document ("features").
val cv = new CountVectorizer().setInputCol("filtered_words").setOutputCol("features")

cv = cntVec_afcac1c186f8


cntVec_afcac1c186f8

In [7]:
// Chain the three stages in order: tokenize -> remove stop words -> count terms.
val pipeline = new Pipeline().setStages(Array(tokenizer, remover, cv))

pipeline = pipeline_73a6fa065fc2


pipeline_73a6fa065fc2

In [8]:
// Fit the pipeline on the documents, then apply the fitted model to the same
// documents; the result keeps the input columns and adds one column per stage.
val df_v = {
  val fittedPipeline = pipeline.fit(df)
  fittedPipeline.transform(df)
}
df_v.show()

[Stage 0:>                                                          (0 + 0) / 4]+---+--------------------+--------------------+--------------------+--------------------+
| id|                 doc|           raw_words|      filtered_words|            features|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|  The sun is shining|[the, sun, is, sh...|      [sun, shining]| (5,[1,2],[1.0,1.0])|
|  1|The weather is sw...|[the, weather, is...|[weather, sweet,,...|(5,[0,3,4],[1.0,1...|
|  2|The sun is shinin...|[the, sun, is, sh...|[sun, shining, we...|(5,[0,1,2,3],[1.0...|
+---+--------------------+--------------------+--------------------+--------------------+



df_v = [id: int, doc: string ... 3 more fields]


[id: int, doc: string ... 3 more fields]

In [9]:
// Bring the feature vectors back to the driver and expand each sparse vector
// into its dense form so the per-term counts are easy to read.
df_v.select("features").collect().map(_.getAs[Vector](0).toDense)

[[0.0,1.0,1.0,0.0,0.0], [1.0,0.0,0.0,1.0,1.0], [1.0,1.0,1.0,1.0,0.0]]