In [2]:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{CountVectorizer, StringIndexer, IndexToString}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

In [3]:
/** Stops every active streaming query of the active Spark session.
  *
  * After stopping a query, prints one line naming it: the description of its
  * first source when progress has been reported, otherwise the query id.
  */
def killAll(): Unit = {
    SparkSession
        .active
        .streams
        .active
        .foreach { query =>
            // lastProgress is null until the first progress update, and the source
            // list may be empty, so neither is dereferenced directly.
            val desc = Option(query.lastProgress)
                .flatMap(_.sources.headOption)
                .map(_.description)
                .getOrElse(s"query ${query.id}")
            query.stop()
            println(s"Stopped ${desc}")
        }
}

killAll: ()Unit


In [5]:
// Streaming source: Kafka records from topic `lab07_in`, keeping only the
// message payload cast to a string.
val dfInput = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "spark-master-1:6667")
    .option("subscribe", "lab07_in")
    // Rate limit per micro-batch. This is a Kafka *source* option, so it only
    // takes effect when configured on the reader.
    .option("maxOffsetsPerTrigger", 200)
    .load()
    .selectExpr("CAST(value AS STRING)")

Waiting for a Spark session to start...

dfInput = [value: string, jsonData: struct<uid: string, visits: array<struct<url:string,timestamp:bigint>>>]


[value: string, jsonData: struct<uid: string, visits: array<struct<url:string,timestamp:bigint>>>]

In [47]:
// Schema of the JSON payload: a `uid` string and a `visits` array whose
// elements are structs with a `url` string and a `timestamp` long.
val schema = StructType(Seq(
    StructField("uid", StringType),
    StructField("visits", ArrayType(StructType(Seq(
        StructField("url", StringType),
        StructField("timestamp", LongType)
    ))))
))

schema = StructType(StructField(uid,StringType,true), StructField(visits,ArrayType(StructType(StructField(url,StringType,true), StructField(timestamp,LongType,true)),true),true))


StructType(StructField(uid,StringType,true), StructField(visits,ArrayType(StructType(StructField(url,StringType,true), StructField(timestamp,LongType,true)),true),true))

In [79]:
// Parses each message as JSON, extracts the lower-cased host of every visited
// URL, and collects the hosts per `uid` into an array column `domains`.
val dfUnpacked = dfInput
    .withColumn("jsonData", from_json(col("value"), schema))
    .select("jsonData.uid", "jsonData.visits")
    // One output row per visited URL.
    .withColumn("url", explode(col("visits.url")))
    .withColumn("domains", lower(callUDF("parse_url", col("url"), lit("HOST"))))
    // Strip only a leading literal "www." prefix. The pattern is a regex, so the
    // dot is escaped and the match is anchored; an unanchored "www." would also
    // delete "www" plus any following character from the middle of a host.
    // NOTE(review): assumes the saved model was trained on hosts normalized the
    // same way - confirm against the training pipeline.
    .withColumn("domains", regexp_replace(col("domains"), "^www\\.", ""))
    .select("uid", "domains")
    .groupBy("uid").agg(collect_list("domains").as("domains"))

dfUnpacked = [uid: string, domains: array<string>]


lastException: Throwable = null


[uid: string, domains: array<string>]

In [80]:
// Print the column layout of the per-user domains frame.
dfUnpacked.printSchema()

root
 |-- uid: string (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [81]:
// Load the previously saved pipeline model from the path "log_reg".
val model: PipelineModel = PipelineModel.load("log_reg")

model = pipeline_0fe14b543b83


pipeline_0fe14b543b83

In [96]:
// Apply the loaded pipeline and keep only the user id together with the
// model's `res` column, exposed under the name `gender_age`.
val dfPredict = model
    .transform(dfUnpacked)
    .select(col("uid"), col("res").as("gender_age"))

dfPredict = [uid: string, gender_age: string]


[uid: string, gender_age: string]

In [101]:
// Print the column layout of the prediction frame.
dfPredict.printSchema()

root
 |-- uid: string (nullable = true)
 |-- gender_age: string (nullable = true)



In [98]:
// dfPredict
//     .selectExpr("CAST(uid AS STRING) AS key", "to_json(struct(*)) AS value")
//     .writeStream
//     .outputMode("update")
//     .format("console")
//     .trigger(Trigger.ProcessingTime("5 seconds"))
//     //.option("checkpointLocation", s"/tmp/$USER/chk")
//     .option("truncate", "false")
//     //.option("numRows", "20")
//     .start

org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@48464a6

-------------------------------------------
Batch: 0
-------------------------------------------
+---+-----+
|key|value|
+---+-----+
+---+-----+

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------+---------------------------------------------------------------------+
|key                                 |value                                                                |
+------------------------------------+---------------------------------------------------------------------+
|1d160259-73d8-4514-911e-757042b22812|{"uid":"1d160259-73d8-4514-911e-757042b22812","gender_age":"F:25-34"}|
|47565df3-13e3-4609-948c-b0853f66d773|{"uid":"47565df3-13e3-4609-948c-b0853f66d773","gender_age":"M:25-34"}|
|4766a8ab-e9b6-4e0d-b3be-59be354401d5|{"uid":"4766a8ab-e9b6-4e0d-b3be-59be354401d5","gender_age":"M:35-44"}|
|7302e78a-ec04-47ee-bb6d-2a2cc07f4845|{"uid":"7302e78a-ec04-47ee-bb6d-2a2cc07f4845","gender_age":"M:35-

In [102]:
// Publish predictions to Kafka topic `lab07_out`: the key is the uid, the value
// is the whole row serialized as JSON. A micro-batch is triggered every
// 5 seconds and only updated rows are emitted.
val query = dfPredict
    .selectExpr("CAST(uid AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .trigger(Trigger.ProcessingTime("5 seconds"))
    .outputMode("update")
    .format("kafka")
    .option("checkpointLocation", "lab07-vvc")
    .option("kafka.bootstrap.servers", "10.0.0.5:6667")
    .option("topic", "lab07_out")
    // No maxOffsetsPerTrigger here: it is a Kafka source option and has no
    // effect on the sink; batch size is limited on the readStream side.
    .start()

query = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@dfe6620


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@dfe6620

In [None]:
// Block this cell until the streaming query stops or fails.
query.awaitTermination()

In [None]:
// Stop all streaming queries that are still running in this session.
killAll()