In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import time
import os

os.environ[
    "PYSPARK_SUBMIT_ARGS"
] = "--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell"


APP_NAME = os.getenv("APP_NAME", "spark-streaming-app")
MASTER = os.getenv("MASTER", "local[*]")

spark = SparkSession.builder.appName("APP_NAME").master(MASTER).getOrCreate()
spark




:: loading settings :: url = jar:file:/home/carlos/.cache/pypoetry/virtualenvs/spark-ml-training-gphMGkfa-py3.10/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/carlos/.ivy2/cache
The jars for the packages stored in: /home/carlos/.ivy2/jars
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e1881416-dae9-4e93-baad-1b523f9f7112;1.0
	confs: [default]
	found org.apache.spark#spark-streaming-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-log

In [85]:
# Read machine learning model

from pyspark.sql.types import StructType, StructField, FloatType, StringType, LongType, IntegerType
from pyspark.sql.functions import col, from_json, struct
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
import json
import pickle

# model_features = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

with open('model/catboost-classifier.pickle', 'rb') as handle:
    model = pickle.load(handle)

@pandas_udf(FloatType())
def predict(row: pd.Series) -> pd.Series:
    
    print('columns', row.columns)
    for i, v in row.iteritems():
        print('index: ', i, 'value: ', v)
    
    if row is None:
        return
    
    # d = json.loads(row)
    # data = pd.DataFrame.from_dict(d, orient = "index").transpose()  
    
    return model.predict(
        row,
        prediction_type='Probability', 
        ntree_start=0, ntree_end=model.get_best_iteration(), 
        thread_count=-1
    )

# from pyspark.sql.functions import udf
# predict_udf = udf(predict, StringType())

In [86]:
df_raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    # .option("kafka.bootstrap.servers", "kafka-cluster-kafka-bootstrap:9092")
    .option("subscribe", "to_predict")
    .option("startingOffsets", "latest")
    .load()
)

In [87]:
df_raw.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [88]:
# df_json = df.selectExpr('CAST(value AS STRING) as json')

# {
#   "id": 1.0079274744188029e+19,
#   "hour": 14103100,
#   "C1": 1005,
#   "banner_pos": 0,
#   "site_id": "85f751fd",
#   "site_domain": "c4e18dd6",
#   "site_category": "50e219e0",
#   "app_id": "febd1138",
#   "app_domain": "82e27996",
#   "app_category": "0f2161f8",
#   "device_id": "a99f214a",
#   "device_ip": "b72692c8",
#   "device_model": "99e427c9",
#   "device_type": 1,
#   "device_conn_type": 0,
#   "C14": 21611,
#   "C15": 320,
#   "C16": 50,
#   "C17": 2480,
#   "C18": 3,
#   "C19": 299,
#   "C20": 100111,
#   "C21": 61
# }

schema = StructType([
    StructField('id', LongType(), True),
    StructField("hour", IntegerType(), True),
    StructField("C1", IntegerType(), True),
    StructField("banner_pos", IntegerType(), True),
    StructField("site_id", StringType(), True),
    StructField("site_domain", StringType(), True),
    StructField("site_category", StringType(), True),
    StructField("app_id", StringType(), True),
    StructField("app_domain", StringType(), True),
    StructField("app_category", StringType(), True),
    StructField("device_id", StringType(), True),
    StructField("device_ip", StringType(), True),
    StructField("device_model", StringType(), True),
    StructField("device_type", IntegerType(), True),
    StructField("device_conn_type", IntegerType(), True),
    StructField("C14", IntegerType(), True),
    StructField("C15", IntegerType(), True),
    StructField("C16", IntegerType(), True),
    StructField("C17", IntegerType(), True),
    StructField("C18", IntegerType(), True),
    StructField("C19", IntegerType(), True),
    StructField("C20", IntegerType(), True),
    StructField("C21", IntegerType(), True)
])


In [89]:
df = df_raw.selectExpr("CAST(value AS STRING)") \
    .select(from_json("value", schema).alias("data")) \
    .select("data.*")

df.printSchema()

root
 |-- id: long (nullable = true)
 |-- hour: integer (nullable = true)
 |-- C1: integer (nullable = true)
 |-- banner_pos: integer (nullable = true)
 |-- site_id: string (nullable = true)
 |-- site_domain: string (nullable = true)
 |-- site_category: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_domain: string (nullable = true)
 |-- app_category: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_ip: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- device_type: integer (nullable = true)
 |-- device_conn_type: integer (nullable = true)
 |-- C14: integer (nullable = true)
 |-- C15: integer (nullable = true)
 |-- C16: integer (nullable = true)
 |-- C17: integer (nullable = true)
 |-- C18: integer (nullable = true)
 |-- C19: integer (nullable = true)
 |-- C20: integer (nullable = true)
 |-- C21: integer (nullable = true)



In [90]:
# dfInt = spark \
#     .readStream \
#     .load() \
#     .selectExpr("cast (body as string) as json") \
#     .select(from_json("json",schema).alias("data")) \
#     .withColumn("k", expr("uuid()")) \
#     .select("key", explode("data.features").alias("feat")) \
#     .select("feat.*", "key") \
#     .groupBy("k") \
#     .agg(*expressions) \
#     .drop("k") \
#     .na.drop() \
#     .withColumn("prediction", predict( (F.struct([col(x) for x in (features)]))))

In [91]:
features = ['hour',
 'C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20']

columns = ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
       'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
       'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16',
       'C17', 'C18', 'C19', 'C20', 'C21']

# df_result = df.withColumn("prediction", predict((struct([col(x) for x in features]))))

# df_result = df.select([col(x) for x in features])

df_result = df.withColumn("prediction", predict((struct([col(x) for x in columns]))))

In [92]:


# Create Temp View
# df.createOrReplaceTempView("dataframe")

# # Apply UDF in SQL query.
# df_results = spark.sql(f"select predict({', '.join(features)}) as score from dataframe")

# df_results = df_json.select(predict_udf("json").alias("value"))

In [93]:
# Start streaming on console

# df.writeStream \
#       .format("console") \
#       .start() \
#       .awaitTermination()

query = df_result \
    .writeStream \
    .format("console") \
    .start()

# query = df_result.writeStream \
#   .format("csv") \
#   .option("checkpointLocation", "checkpoint/") \
#   .option("path", "data/") \
#   .outputMode("append") \
#   .start()

time.sleep(10) # sleep 10 seconds

query.stop()

22/06/02 14:02:19 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-56981d04-78fc-4543-bfc8-979a987dcc8b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/06/02 14:02:19 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---+----+---+----------+-------+-----------+-------------+------+----------+------------+---------+---------+------------+-----------+----------------+---+---+---+---+---+---+---+---+----------+
| id|hour| C1|banner_pos|site_id|site_domain|site_category|app_id|app_domain|app_category|device_id|device_ip|device_model|device_type|device_conn_type|C14|C15|C16|C17|C18|C19|C20|C21|prediction|
+---+----+---+----------+-------+-----------+-------------+------+----------+------------+---------+---------+------------+-----------+----------------+---+---+---+---+---+---+---+---+----------+
+---+----+---+----------+-------+-----------+-------------+------+----------+------------+---------+---------+------------+-----------+----------------+---+---+---+---+---+---+---+---+----------+



22/06/02 14:02:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
columns Index(['hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
       'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
       'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16',
       'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')
index:  hour value:  0    14103100
Name: hour, dtype: int32
index:  C1 value:  0    1005
Name: C1, dtype: int32
index:  banner_pos value:  0    0
Name: banner_pos, dtype: int32
index:  site_id value:  0    1fbe01fe
Name: site_id, dtype: object
index:  site_domain value:  0    f3845767
Name: site_domain, dtype: object
index:  site_category value:  0    28905ebd
Name: site_category, dtype: object
index:  app_id value:  0    ecad2386
Name: app_id, dtype: object
index:  app_domain value:  0    7801e8d9
Name: app

In [None]:
stop

In [None]:
# # read from Kafka 
# df = spark.readStream.format("kafka") 
#   .option("kafka.bootstrap.servers", "{external_ip}:9092") 
#   .option("subscribe", "dsp").load() 
# # select the value field and apply the UDF     
# df = df.selectExpr("CAST(value AS STRING)")
# score_udf = udf(score, StringType())    
# df = df.select( score_udf("value").alias("value"))
# # Write results to Kafka 
# query = df.writeStream.format("kafka") 
#   .option("kafka.bootstrap.servers", "{external_ip}:9092") 
#   .option("topic", "preds") 
#   .option("checkpointLocation", "/temp").start()

In [None]:
# df = df.select('request_id', \
#     F.col('data').getItem('CPU').alias('CPU'), \
#     F.col('data').getItem('memory').alias('memory'), \
#     F.col('data').getItem('disk').alias('disk'))

# df.printSchema()

In [None]:
# Start streaming on console

# df.writeStream \
#       .format("console") \
#       .start() \
#       .awaitTermination()

query = df \
    .writeStream \
    .format("console") \
    .start()

time.sleep(10) # sleep 10 seconds

query.stop()

In [None]:
stop

In [None]:
!pip install scipy

In [None]:
# Create Temp View
df.createOrReplaceTempView("dataframe")

# Apply UDF in SQL query.
resultDF = spark.sql("select predict(*) as up_down_udf from dataframe")

In [None]:
@pandas_udf("Count long, Resampled long, Start timestamp, End timestamp", PandasUDFType.GROUPED_MAP)
def myudf(df):
  df = df.dropna()
  df = df.set_index("Timestamp")
  df.sort_index(inplace=True)

  # resample the dataframe
  resampled = pd.DataFrame()
  oidx = df.index
  nidx = pd.date_range(oidx.min(), oidx.max(), freq="30S")
  resampled["Value"] = df.Value.reindex(oidx.union(nidx)).interpolate('index').reindex(nidx)
  return pd.DataFrame([[len(df.index), len(resampled.index), df.index.min(), df.index.max()]], columns=["Count", "Resampled", "Start", "End"])

predictionStream = sensorStream.withWatermark("Timestamp", "90 minutes").groupBy(col("Name"), window(col("Timestamp"), "70 minutes", "5 minutes"))

predictionStream.apply(myudf).writeStream \
    .queryName("aggregates") \
    .format("memory") \
    .start()

In [None]:
import time

df = df.selectExpr("CAST(value AS STRING)").selectExpr("CAST(value AS STRING)")

# Start running the query that prints the running counts to the console
query = df \
    .writeStream \
    .format("console") \
    .start()

# # query.awaitTermination()

time.sleep(10) # sleep 10 seconds

query.stop()

In [None]:
query = values.writeStream \
    .format("json") \
    .outputMode("append")
    .start("./topic.json")
    
import time

time.sleep(10) # sleep 10 seconds

query.stop()

In [None]:
df.writeStream \
      .format("console") \
      .outputMode("append") \
      .start() \
      .awaitTermination()

In [None]:
spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka-cluster-kafka-bootstrap:9092") \
    .option("subscribe", "app_messages") \
    .option("startingOffsets", "latest") \
    .load() \
    .writeStream \
    .format("json") \
    .outputMode("append") \ # .option("checkpointLocation", bronze_checkpoint_loc_vehicle) \
    .start("topic.json")