### Построение модели на RandomForest

In [42]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}

import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}


In [43]:
// Column layout of the headerless CSV file; only `target` and `tweet` are used below.
val dataSchema = new StructType(Array(
  StructField("target", IntegerType),
  StructField("id", LongType),
  StructField("raw_timestamp", StringType),
  StructField("query_status", StringType),
  StructField("author", StringType),
  StructField("tweet", StringType)
))

val dataPath = "./../data/training.1600000.processed.noemoticon.csv"

// Read the file with the explicit schema and derive a binary label:
// target == 4 becomes 1, every other target value becomes 0.
val raw_sentiment = spark.read
  .schema(dataSchema)
  .option("header", false)
  .csv(dataPath)
  .selectExpr("(case when target=4 then 1 else 0 end) as label", "tweet")

// Print the number of rows per label.
raw_sentiment.groupBy("label").count().show()

+-----+------+
|label| count|
+-----+------+
|    1|800000|
|    0|800000|
+-----+------+



dataSchema: org.apache.spark.sql.types.StructType = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath: String = ./../data/training.1600000.processed.noemoticon.csv
raw_sentiment: org.apache.spark.sql.DataFrame = [label: int, tweet: string]


In [44]:
// Split each tweet into tokens, written to the `words` column.
val tokenizer = new Tokenizer()
  .setInputCol("tweet")
  .setOutputCol("words")

// Hash the tokens into a fixed-size (1000 buckets) term-frequency vector.
val hashingTF = new HashingTF()
  .setNumFeatures(1000)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")

// Index labels, adding metadata to the label column.
// Fitted eagerly on the whole dataset so that every label is present in the index.
val labelIndexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("indexedLabel")
  .fit(raw_sentiment)

// Automatically identify categorical features, and index them.
// Features with > 4 distinct values are treated as continuous; the rest become categorical.
// A hashed term-count bucket that showed <= 4 distinct values during fitting can still
// take a new value on unseen tweets. With the default handleInvalid = "error" the
// persisted model would throw on such rows, so unseen values are routed to an extra
// bucket instead ("keep").
val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .setHandleInvalid("keep")

// Train a RandomForest model on the indexed label / feature columns.
val rf = new RandomForestClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setNumTrees(10)

// Convert indexed labels back to original labels.
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels)

// Chain text featurization, indexers and the forest in a single Pipeline.
val pipeline = new Pipeline()
  .setStages(Array(tokenizer, hashingTF, labelIndexer, featureIndexer, rf, labelConverter))

tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_252a0128cf5d
hashingTF: org.apache.spark.ml.feature.HashingTF = hashingTF_c657e80c0127
labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_3d83aee0f371
featureIndexer: org.apache.spark.ml.feature.VectorIndexer = vecIdx_c3aa91637d20
rf: org.apache.spark.ml.classification.RandomForestClassifier = rfc_fe66e9c8f1c5
labelConverter: org.apache.spark.ml.feature.IndexToString = idxToStr_582200aa98f4
pipeline: org.apache.spark.ml.Pipeline = pipeline_e4bd24ccb25b


In [45]:
// Fit all pipeline stages on the full raw_sentiment DataFrame (no rows are held out here).
val model = pipeline.fit(raw_sentiment)

2020-01-26 17:39:36 WARN  MemoryStore:66 - Not enough space to cache rdd_414_5 in memory! (computed 46.0 MB so far)
2020-01-26 17:39:36 WARN  BlockManager:66 - Persisting block rdd_414_5 to disk instead.
2020-01-26 17:39:36 WARN  MemoryStore:66 - Not enough space to cache rdd_414_7 in memory! (computed 46.0 MB so far)
2020-01-26 17:39:36 WARN  BlockManager:66 - Persisting block rdd_414_7 to disk instead.
2020-01-26 17:39:36 WARN  MemoryStore:66 - Not enough space to cache rdd_414_4 in memory! (computed 46.0 MB so far)
2020-01-26 17:39:36 WARN  BlockManager:66 - Persisting block rdd_414_4 to disk instead.
2020-01-26 17:39:36 WARN  MemoryStore:66 - Not enough space to cache rdd_414_6 in memory! (computed 46.0 MB so far)
2020-01-26 17:39:36 WARN  BlockManager:66 - Persisting block rdd_414_6 to disk instead.
2020-01-26 17:39:37 WARN  MemoryStore:66 - Not enough space to cache rdd_414_0 in memory! (computed 71.5 MB so far)
2020-01-26 17:39:37 WARN  BlockManager:66 - Persisting block rdd_414

model: org.apache.spark.ml.PipelineModel = pipeline_e4bd24ccb25b


In [46]:
// Persist the fitted pipeline under ./models/spark-ml-model-rf, overwriting a model already saved there.
model.write.overwrite().save("./models/spark-ml-model-rf")

In [47]:
// Load the fitted pipeline back from the path it was saved to.
// NOTE(review): PipelineModel is not imported in this section; presumably an earlier cell imports it — confirm.
val sameModel = PipelineModel.load("./models/spark-ml-model-rf")

sameModel: org.apache.spark.ml.PipelineModel = pipeline_e4bd24ccb25b


In [48]:
// Score tweets with the reloaded pipeline. raw_sentiment is the same DataFrame the
// pipeline was fitted on, so these predictions are not a held-out evaluation.
val predictionsDF = sameModel.transform(raw_sentiment)

// Print the first rows, including the intermediate and prediction columns added by the pipeline.
predictionsDF.show()

+-----+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+----------+--------------+
|label|               tweet|               words|            features|indexedLabel|     indexedFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+-----+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+----------+--------------+
|    0|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|         0.0|(1000,[7,14,21,54...|[4.39665546729450...|[0.43966554672945...|       1.0|             1|
|    0|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|         0.0|(1000,[170,193,22...|[5.24037086071899...|[0.52403708607189...|       0.0|             0|
|    0|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|         0.0|(1000,[10,36,77,1...|[5.89570496145148...|[0.5895704

predictionsDF: org.apache.spark.sql.DataFrame = [label: int, tweet: string ... 8 more fields]
