# Count Vectorize 


Count vectorization is a computationally expensive operation: this step takes about 10 minutes to run. Instead of computing a new count vector every time we train a new model, the count vectorization is done once and the result is saved to disk.




In [1]:
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.LongType

// Load the tab-separated source data (the first line of each file is a header),
// cast `id` to a long, and keep a single row per `id`.
//
// Rows with a null `id`, `group_id` or `claims` are dropped before
// de-duplication:
//   * an `id` that cannot be cast to LongType becomes null, and
//     dropDuplicates("id") would otherwise keep one arbitrary null-id row;
//   * `group_id` and `claims` feed StringIndexer and Tokenizer further down,
//     and a null in either aborts the whole (expensive) pipeline run.
// Dropping them first also guarantees an incomplete row can never be the one
// retained for a duplicated `id`.
val df = spark.read.format("csv")
  .option("sep", "\t")
  .option("header", "true")
  .load("../Projects/data/partitioned_hdfs")
  .withColumn("id", col("id").cast(LongType))
  .na.drop(Seq("id", "group_id", "claims"))
  .dropDuplicates("id")

In [2]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{
  CountVectorizer, StopWordsRemover, StringIndexer, Tokenizer, VectorAssembler
}

// Feature pipeline: group_id -> label, claims -> tokens -> stop-word-filtered
// tokens -> term-count vector -> features.
// The stages are never reassigned, so they are declared as `val`s.

// Encodes the string column `group_id` as the numeric `label` column.
val indexer = new StringIndexer()
    .setInputCol("group_id")
    .setOutputCol("label")

// Splits the raw `claims` text into tokens.
val tokenizer = new Tokenizer()
    .setInputCol("claims")
    .setOutputCol("tokens")

// Removes stop words from the token list.
val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("sWord")

// Term-count vectors with the vocabulary capped at 10000 entries.
// Deliberately NOT named `cvec`: that name is bound to the fitted pipeline
// model further down, and reusing it for the estimator shadows one with the
// other (and would not compile outside a REPL).
val countVectorizer = new CountVectorizer()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("cvec")
    .setVocabSize(10000)

// Exposes the count vector under the conventional `features` column name.
val assembler = new VectorAssembler()
    .setInputCols(Array(countVectorizer.getOutputCol))
    .setOutputCol("features")

val pipe = new Pipeline()
    .setStages(Array(
        indexer,
        tokenizer,
        remover,
        countVectorizer,
        assembler
    ))
    

In [3]:
// Fit all pipeline stages on the full dataset; this is the expensive step.
// NOTE(review): despite its name, `cvec` holds the fitted PipelineModel for the
// whole pipeline, not just the count vectorizer. The name is kept because the
// following cell calls `cvec.transform`.
val cvec: org.apache.spark.ml.PipelineModel = pipe.fit(df)

In [4]:
// Run the fitted pipeline over the same data to append the columns produced by
// its stages (including `label` and `features`).
val vec_df: org.apache.spark.sql.DataFrame = cvec.transform(df)

In [5]:
// Persist only the columns needed for modelling so the vectorization does not
// have to be recomputed.
// No format or save mode is set, so the session defaults apply.
// NOTE(review): the default save mode fails if the target path already exists —
// confirm that is the intended behaviour when re-running this cell.
vec_df
    .select("id", "label", "features")
    .write
    .save("../Projects/data/vec_df_hdfs")