In [1]:
// Print the version of the Spark session the kernel exposes as `spark`.
locally {
  val version = spark.version
  println(s"Current spark version is ${version}")
}

Intitializing Scala interpreter ...

Spark Web UI available at http://mbp-aleksei:4041
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1586357200484)
SparkSession available as 'spark'


Current spark version is 2.4.5


In [2]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Column layout of the header-less CSV file; every field is nullable.
val dataSchema = StructType(Seq(
  StructField("target", IntegerType),
  StructField("id", LongType),
  StructField("raw_timestamp", StringType),
  StructField("query_status", StringType),
  StructField("author", StringType),
  StructField("tweet", StringType)
))

val dataPath = "./data/training.1600000.processed.noemoticon.csv"

// Read the file with the explicit schema and keep only a binary label
// (target 4 -> 1, anything else -> 0) next to the tweet text.
val raw_sentiment = spark.read
  .schema(dataSchema)
  .option("header", "false")
  .csv(dataPath)
  .selectExpr("(case when target=4 then 1 else 0 end) as label","tweet")

// Number of rows per label.
raw_sentiment.groupBy("label").count().show()

// Tweet text only, used for a quick look at the data.
val tweets = raw_sentiment.select("tweet")

+-----+------+
|label| count|
+-----+------+
|    1|800000|
|    0|800000|
+-----+------+



import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}
dataSchema: org.apache.spark.sql.types.StructType = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath: String = ./data/training.1600000.processed.noemoticon.csv
raw_sentiment: org.apache.spark.sql.DataFrame = [label: int, tweet: string]
tweets: org.apache.spark.sql.DataFrame = [tweet: string]


In [3]:
// Display the first 10 tweets without shortening long cell values.
tweets.show(numRows = 10, truncate = false)

+-------------------------------------------------------------------------------------------------------------------+
|tweet                                                                                                              |
+-------------------------------------------------------------------------------------------------------------------+
|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                          |
|my whole body feels itchy and like its on fire                                                                     |
|@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.     |
|@Kwesidei not the whole crew                           

In [4]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

// Stage 1: tokenize the "tweet" column into a "words" column.
val tokenizer = new Tokenizer()
  .setOutputCol("words")
  .setInputCol("tweet")

// Stage 2: hash the tokens into a "features" vector with 1000 slots.
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)

// Stage 3: logistic regression, at most 10 iterations, regParam 0.001.
val lr = new LogisticRegression()
  .setRegParam(0.001)
  .setMaxIter(10)

// All three stages chained into one pipeline.
val fullPipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))


import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_7a3e50a2a253
hashingTF: org.apache.spark.ml.feature.HashingTF = hashingTF_832426f01cd1
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_9cf4e7b48c69
fullPipeline: org.apache.spark.ml.Pipeline = pipeline_14f5873fda49


In [5]:
// Feature-preparation stages only (tokenizer followed by hashingTF).
val preparePipeline = new Pipeline().setStages(Array(tokenizer, hashingTF))

// Classifier stage only.
val modelPipeline = new Pipeline().setStages(Array(lr))

preparePipeline: org.apache.spark.ml.Pipeline = pipeline_22cb45455aaa
modelPipeline: org.apache.spark.ml.Pipeline = pipeline_75af386658f3


In [6]:
// End-to-end model fitted directly on the labelled tweets.
val fullModel: PipelineModel = fullPipeline.fit(raw_sentiment)

// The same work in two steps: fit and apply the feature preparation,
// then fit the classifier pipeline on the prepared DataFrame.
val prepareModel: PipelineModel = preparePipeline.fit(raw_sentiment)
val preparedDf = prepareModel.transform(raw_sentiment)
val modelModel: PipelineModel = modelPipeline.fit(preparedDf)

fullModel: org.apache.spark.ml.PipelineModel = pipeline_14f5873fda49
prepareModel: org.apache.spark.ml.PipelineModel = pipeline_22cb45455aaa
preparedDf: org.apache.spark.sql.DataFrame = [label: int, tweet: string ... 2 more fields]
modelModel: org.apache.spark.ml.PipelineModel = pipeline_75af386658f3


In [7]:
// Persist each fitted model to its own directory, replacing any previous save.
Seq(
  fullModel -> "./fullModel/spark-ml-model",
  prepareModel -> "./prepareModel/spark-ml-model",
  modelModel -> "./modelModel/spark-ml-model"
).foreach { case (fitted, target) =>
  fitted.write.overwrite().save(target)
}

In [8]:
// Load the three saved pipeline models back from disk.
val fullModelFromFile: PipelineModel =
  PipelineModel.load("./fullModel/spark-ml-model")
val prepareModelFromFile: PipelineModel =
  PipelineModel.load("./prepareModel/spark-ml-model")
val modelModelFromFile: PipelineModel =
  PipelineModel.load("./modelModel/spark-ml-model")

fullModelFromFile: org.apache.spark.ml.PipelineModel = pipeline_14f5873fda49
prepareModelFromFile: org.apache.spark.ml.PipelineModel = pipeline_22cb45455aaa
modelModelFromFile: org.apache.spark.ml.PipelineModel = pipeline_75af386658f3


In [9]:
// Score the raw tweets with the reloaded end-to-end model and preview three rows.
val fullModelResult = fullModelFromFile.transform(raw_sentiment)
fullModelResult.show(numRows = 3)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|               tweet|               words|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|[-0.9010125659402...|[0.28884245921785...|       1.0|
|    0|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|[1.84195706807744...|[0.86318000204741...|       0.0|
|    0|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|[1.56488554961127...|[0.82705328017343...|       0.0|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



fullModelResult: org.apache.spark.sql.DataFrame = [label: int, tweet: string ... 5 more fields]


In [10]:
// Run only the reloaded feature-preparation model and preview three rows.
val prepareModelResult = prepareModelFromFile.transform(raw_sentiment)
prepareModelResult.show(numRows = 3)

+-----+--------------------+--------------------+--------------------+
|label|               tweet|               words|            features|
+-----+--------------------+--------------------+--------------------+
|    0|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|
|    0|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|
|    0|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|
+-----+--------------------+--------------------+--------------------+
only showing top 3 rows



prepareModelResult: org.apache.spark.sql.DataFrame = [label: int, tweet: string ... 2 more fields]


In [11]:
// Apply the reloaded classifier-only model to the prepared features and preview three rows.
val modelModelResult = modelModelFromFile.transform(prepareModelResult)
modelModelResult.show(numRows = 3)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|               tweet|               words|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|[-0.9010125659402...|[0.28884245921785...|       1.0|
|    0|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|[1.84195706807744...|[0.86318000204741...|       0.0|
|    0|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|[1.56488554961127...|[0.82705328017343...|       0.0|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



modelModelResult: org.apache.spark.sql.DataFrame = [label: int, tweet: string ... 5 more fields]


In [12]:
import org.apache.spark.sql.functions._

// UDF that returns the element at index 1 of an ML vector column.
val getProbability = udf { (probabilities: org.apache.spark.ml.linalg.Vector) =>
  probabilities(1)
}

import org.apache.spark.sql.functions._
getProbability: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


In [13]:
// Show the index-1 entry of the "probability" vector as a plain numeric column.
modelModelResult
  .select(getProbability(col("probability")).as("clean_probability"))
  .show()

+-------------------+
|  clean_probability|
+-------------------+
| 0.7111575407821429|
|0.13681999795258062|
| 0.1729467198265621|
| 0.4445137910464958|
|0.03783763627521715|
|0.41759967706184536|
| 0.4753971538982056|
|  0.912918885817382|
|0.30397625759950175|
| 0.5326649357031025|
| 0.4390689703426488|
| 0.5978735997545636|
| 0.3332295562520034|
| 0.4913598594117323|
|0.04828581755526847|
|0.21764493477476207|
|0.26225986886407654|
| 0.5282380556537423|
| 0.7151514980011853|
| 0.7542266623347162|
+-------------------+
only showing top 20 rows

