In [1]:
// Sanity check: report the version exposed by the notebook-provided `spark` handle.
locally {
  val sparkVersion = spark.version
  println(s"Current spark version is ${sparkVersion}")
}

Current spark version is 2.4.4


In [2]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}
import org.apache.spark.sql.functions._

// Explicit schema applied to the headerless CSV read below (header = false).
val dataSchema = new StructType()
    .add("target", IntegerType)
    .add("id", LongType)
    .add("raw_timestamp", StringType)
    .add("query_status", StringType)
    .add("author", StringType)
    .add("tweet", StringType)

// Location of the training CSV inside the notebook container.
val dataPath= "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Keep only the tweet text and cache it. This DataFrame is counted right below
// and re-sampled repeatedly by the timer task later in the notebook; without
// the cache every one of those actions would re-read and re-parse the CSV.
val raw_sentiment = spark.read
    .format("csv")
    .option("header", false)
    .schema(dataSchema)
    .load(dataPath)
    .selectExpr("tweet")
    .cache()

// `count` is an action, so it also populates the cache requested above.
println(s"Total tweets in file: ${raw_sentiment.count}")


Total tweets in file: 1600000


dataSchema = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath = /home/jovyan/data/training.1600000.processed.noemoticon.csv
raw_sentiment = [tweet: string]


[tweet: string]

In [3]:
import java.util.{Calendar, Timer, TimerTask}

import scala.util.control.NonFatal

// Background timer that drives the simulated event stream.
val timer = new Timer()

// On each tick: draw a tiny random sample of tweets, stamp it with the current
// time and append it as a single JSON file to the events-stream directory.
val task = new TimerTask {
  override def run(): Unit = {
    // An exception escaping run() terminates the Timer's thread and silently
    // stops every future tick, so failures are reported here and the next
    // tick is allowed to try again.
    try {
      val data = raw_sentiment.sample(fraction = 0.00001, withReplacement = true)
          // current_timestamp() is already a Column; wrapping it in lit() adds nothing.
          .withColumn("timestamp", current_timestamp())
      // coalesce(1) so each tick writes one output file rather than one per partition.
      data.coalesce(1).write.format("json").mode("append").save("/home/jovyan/data/events-stream")
      println(s"${Calendar.getInstance().toInstant} - saved some data to the events stream!")
    } catch {
      case NonFatal(e) =>
        println(s"${Calendar.getInstance().toInstant} - failed to save data to the events stream: $e")
    }
  }
}

println("Streaming started!")

// First run after 1 second, then again 1 second after each run finishes (fixed-delay).
timer.schedule(task, 1000L, 1000L)

Streaming started!


timer = java.util.Timer@6d7e48a7
task = $anon$1@45781cc2


$anon$1@45781cc2

2020-02-08T00:40:55.280Z - saved some data to the events stream!
2020-02-08T00:41:01.212Z - saved some data to the events stream!
2020-02-08T00:41:08.299Z - saved some data to the events stream!
2020-02-08T00:41:14.028Z - saved some data to the events stream!
2020-02-08T00:41:19.384Z - saved some data to the events stream!
2020-02-08T00:41:24.762Z - saved some data to the events stream!
2020-02-08T00:41:30.394Z - saved some data to the events stream!
2020-02-08T00:41:35.419Z - saved some data to the events stream!
2020-02-08T00:41:40.783Z - saved some data to the events stream!
2020-02-08T00:41:46.026Z - saved some data to the events stream!
2020-02-08T00:41:51.453Z - saved some data to the events stream!
2020-02-08T00:41:57.157Z - saved some data to the events stream!
2020-02-08T00:42:02Z - saved some data to the events stream!
2020-02-08T00:42:07.854Z - saved some data to the events stream!
2020-02-08T00:42:13.270Z - saved some data to the events stream!
2020-02-08T00:42:18.085Z - sa

In [4]:
// Stop the simulated stream. Cancelling only the task leaves the Timer's
// background thread alive, so the timer itself is shut down as well. The cell
// still evaluates to the result of task.cancel().
try task.cancel() finally timer.cancel()

true