In [None]:
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DecimalType,
)
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DoubleType,
)
from pyspark.sql.functions import (
    split,
    regexp_replace,
    current_date,
    unix_timestamp,
    lit,
    current_timestamp,
)

from pyspark.sql.functions import col, from_json, struct, to_json
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

import pandas as pd
import pickle
import json
import time
import os


In [None]:
# Extra JVM packages requested from PySpark: the Kafka connectors for Spark 3.2.0
# and the PostgreSQL JDBC driver. This is set before the SparkSession is built below.
os.environ[
    "PYSPARK_SUBMIT_ARGS"
] = "--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,org.postgresql:postgresql:42.1.1 pyspark-shell"


# Runtime configuration. Every value can be overridden with an environment
# variable of the same name; the defaults reproduce the previous behaviour
# (APP_NAME used to be read but ignored, so its default is now the name the
# session was actually created with).
APP_NAME = os.getenv("APP_NAME", "Spark Structured Streaming Application")
MASTER = os.getenv("MASTER", "local[*]")
KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost:9092")  # e.g. "kafka-cluster-kafka-bootstrap:9092"

# MASTER = "spark://carloshkayser:7077"

spark = SparkSession.builder.appName(APP_NAME).master(MASTER).getOrCreate()

spark


In [None]:
# Streaming source: the "to_predict" Kafka topic on KAFKA_HOST, with
# startingOffsets set to "latest".
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_HOST,
    "subscribe": "to_predict",
    "startingOffsets": "latest",
}

df_raw = spark.readStream.format("kafka").options(**kafka_source_options).load()


In [None]:
# Print the schema of the raw Kafka stream before any parsing is applied.
df_raw.printSchema()


In [None]:
# Example of one incoming JSON message:
# {
#   "id": 1.0079274744188029e+19,
#   "hour": 14103100,
#   "C1": 1005,
#   "banner_pos": 0,
#   "site_id": "85f751fd",
#   "site_domain": "c4e18dd6",
#   "site_category": "50e219e0",
#   "app_id": "febd1138",
#   "app_domain": "82e27996",
#   "app_category": "0f2161f8",
#   "device_id": "a99f214a",
#   "device_ip": "b72692c8",
#   "device_model": "99e427c9",
#   "device_type": 1,
#   "device_conn_type": 0,
#   "C14": 21611,
#   "C15": 320,
#   "C16": 50,
#   "C17": 2480,
#   "C18": 3,
#   "C19": 299,
#   "C20": 100111,
#   "C21": 61
# }

# (field name, Spark type) pairs in message order; every field is nullable.
field_types = [
    ("id", DecimalType(38, 0)),
    ("hour", IntegerType()),
    ("C1", IntegerType()),
    ("banner_pos", IntegerType()),
    ("site_id", StringType()),
    ("site_domain", StringType()),
    ("site_category", StringType()),
    ("app_id", StringType()),
    ("app_domain", StringType()),
    ("app_category", StringType()),
    ("device_id", StringType()),
    ("device_ip", StringType()),
    ("device_model", StringType()),
    ("device_type", IntegerType()),
    ("device_conn_type", IntegerType()),
    ("C14", IntegerType()),
    ("C15", IntegerType()),
    ("C16", IntegerType()),
    ("C17", IntegerType()),
    ("C18", IntegerType()),
    ("C19", IntegerType()),
    ("C20", IntegerType()),
    ("C21", IntegerType()),
]

schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in field_types]
)


In [None]:
# Step 1: read the Kafka `value` column as text.
json_text = df_raw.selectExpr("CAST(value AS STRING)")

# Step 2: parse the text as JSON using `schema`, nested under a "data" struct.
parsed = json_text.select(from_json("value", schema).alias("data"))

# Step 3: promote the struct's fields to top-level columns.
df = parsed.select("data.*")

df.printSchema()


In [None]:
from pyspark.ml import PipelineModel

# Load the previously saved ML pipeline from a path relative to the working
# directory (presumably a logistic-regression pipeline, going by the directory
# name — confirm against the training notebook).
pipelineModel = PipelineModel.load("model/spark-logistic-regression-model")

# Run the parsed stream through the pipeline. The following cell reads the
# `probability` column from this result.
results = pipelineModel.transform(df)

results.printSchema()


In [None]:
from pyspark.ml.functions import vector_to_array

# NOTE: this cell rebinds `results`. After it has run, `probability` is a
# double rather than an ML vector, so re-run the model cell before re-running
# this one.

# Stamp each row with the time it was scored.
results = results.withColumn("processed_at", current_timestamp())

# `probability` is an ML vector. Take element 1 directly instead of casting the
# vector to a string and parsing it with a regex: the string round trip relied
# on the dense "[a,b]" rendering and on a non-raw regex literal containing
# invalid escape sequences.
results = results.select(
    "id",
    vector_to_array(col("probability"))[1].alias("probability"),
    "processed_at",
)

# Kafka sink expects a single `value` column: serialise each row as JSON.
results_kafka = results.select(
    to_json(struct("id", "probability", "processed_at")).alias("value")
)

# JDBC sink writes the columns as-is.
results_postgres = results.select("id", "probability", "processed_at")

In [None]:
# Print the schema of the frame destined for Kafka (a single JSON `value` column).
results_kafka.printSchema()

In [None]:
# Print the schema of the frame destined for PostgreSQL (id, probability, processed_at).
results_postgres.printSchema()


### Logging the data stream in the console

In [None]:
# Preview the predictions on the console for about 10 seconds.
query = (
    results.select("id", "probability", "processed_at")
    .writeStream.format("console")
    .start()
)

try:
    # Waits up to 10 seconds. Unlike time.sleep, this raises if the streaming
    # query fails during the preview window.
    query.awaitTermination(10)
finally:
    # Always stop the query, even if the cell is interrupted or the query failed.
    query.stop()


### Inserting data stream transformation results into another Apache Kafka topic

In [None]:
# Each streaming query needs its own checkpoint directory, so this sink gets a
# dedicated sub-directory instead of sharing one with the PostgreSQL query.
# CHECKPOINT_DIR overrides the base location.
kafka_checkpoint = os.path.join(
    os.getenv("CHECKPOINT_DIR", "/home/kayser/temp"), "kafka-predictions"
)

# Publish the JSON-encoded predictions to the "predictions" topic on the same
# broker the source stream reads from.
query = (
    results_kafka.writeStream.format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("topic", "predictions")
    .option("checkpointLocation", kafka_checkpoint)
    .outputMode("Append")
    .start()
)

try:
    # Run for up to 30 seconds; raises if the query fails in the meantime.
    query.awaitTermination(30)
finally:
    # Always stop the query, even if the cell is interrupted.
    query.stop()


### Inserting data stream into PostgreSQL database

In [None]:
def foreach_batch_function(df, epoch_id):
    """Append one micro-batch to a PostgreSQL table over JDBC.

    Connection settings are read from the environment on every call so that
    credentials do not have to live in the notebook; the defaults match the
    previous hard-coded local setup.

    Environment variables (default):
        POSTGRES_URL      ("jdbc:postgresql://localhost:5432/postgres")
        POSTGRES_TABLE    ("predictions")
        POSTGRES_USER     ("postgres")
        POSTGRES_PASSWORD ("postgres")

    Args:
        df: The micro-batch DataFrame handed over by ``foreachBatch``.
        epoch_id: Identifier of the micro-batch (unused).
    """
    jdbc_options = {
        "url": os.getenv("POSTGRES_URL", "jdbc:postgresql://localhost:5432/postgres"),
        "driver": "org.postgresql.Driver",
        "dbtable": os.getenv("POSTGRES_TABLE", "predictions"),
        "user": os.getenv("POSTGRES_USER", "postgres"),
        "password": os.getenv("POSTGRES_PASSWORD", "postgres"),
    }

    df.write.format("jdbc").options(**jdbc_options).mode("append").save()

# Each streaming query needs its own checkpoint directory, so this sink gets a
# dedicated sub-directory instead of sharing one with the Kafka query.
# CHECKPOINT_DIR overrides the base location.
postgres_checkpoint = os.path.join(
    os.getenv("CHECKPOINT_DIR", "/home/kayser/temp"), "postgres-predictions"
)

# Keep the handle so the query can be inspected or stopped later.
postgres_query = (
    results_postgres.writeStream.foreachBatch(foreach_batch_function)
    .option("checkpointLocation", postgres_checkpoint)
    .outputMode("update")
    .start()
)


In [None]:
# Block this cell until any of the session's active streaming queries terminates.
spark.streams.awaitAnyTermination()


In [None]:
# NOTE(review): `stop` is not defined in any visible cell, so evaluating it
# raises NameError. Presumably a deliberate barrier that halts "Run All" before
# the scratch cells below — confirm, and consider an explicit `raise` with a message.
stop


In [None]:
class PostgresSink:
    """Holds the JDBC connection URL for a PostgreSQL sink.

    The original definition had an empty ``__init__`` body, which is a syntax
    error. It now stores its argument so the class can be instantiated and
    extended with write logic later.
    """

    def __init__(self, jdbc: str):
        """Remember the JDBC URL.

        Args:
            jdbc: JDBC connection string, e.g.
                "jdbc:postgresql://localhost:5432/postgres".
        """
        self.jdbc = jdbc

In [None]:
# Download the PostgreSQL JDBC driver jar (version 42.2.6).
# NOTE(review): PYSPARK_SUBMIT_ARGS in the configuration cell already requests
# org.postgresql:postgresql:42.1.1 via --packages — confirm whether this manual
# download (and the differing version) is still needed.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.6.jar

In [None]:
# Load the pickled model used by the `predict` UDF in this cell group
# (presumably a CatBoost classifier, going by the file name — confirm).


# model_features = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# model files that come from a trusted source.
with open("model/catboost-classifier.pickle", "rb") as handle:
    model = pickle.load(handle)


@pandas_udf(FloatType())
def predict(row: pd.DataFrame) -> pd.Series:
    """Score one Arrow batch of feature rows with the module-level `model`.

    The UDF is applied to a struct column, so each batch arrives as a pandas
    DataFrame with one column per struct field.

    Args:
        row: Batch of feature rows (the name is kept for compatibility; it
            holds many rows, not one).

    Returns:
        A float32 Series with one probability per input row, taken from
        column 1 of the model's probability output (assumed to be the
        positive class of a binary classifier — confirm against training).
    """
    # An empty batch still has to yield a Series of matching (zero) length.
    if row.empty:
        return pd.Series([], dtype="float32")

    # get_best_iteration() is None when the model was trained without a
    # validation set; ntree_end=0 then means "use every tree". The tree range
    # is half-open, so +1 keeps the best iteration itself.
    best_iteration = model.get_best_iteration()
    ntree_end = 0 if best_iteration is None else best_iteration + 1

    probabilities = model.predict(
        row,
        prediction_type="Probability",
        ntree_start=0,
        ntree_end=ntree_end,
        thread_count=-1,
    )

    # "Probability" output has one column per class; a FloatType UDF must
    # return a single value per row.
    return pd.Series(probabilities[:, 1], dtype="float32")


# from pyspark.sql.functions import udf
# predict_udf = udf(predict, StringType())


In [None]:
# dfInt = spark \
#     .readStream \
#     .load() \
#     .selectExpr("cast (body as string) as json") \
#     .select(from_json("json",schema).alias("data")) \
#     .withColumn("k", expr("uuid()")) \
#     .select("key", explode("data.features").alias("feat")) \
#     .select("feat.*", "key") \
#     .groupBy("k") \
#     .agg(*expressions) \
#     .drop("k") \
#     .na.drop() \
#     .withColumn("prediction", predict( (F.struct([col(x) for x in (features)]))))


In [None]:
# Input columns up to and including "C20".
features = [
    "hour", "C1", "banner_pos",
    "site_id", "site_domain", "site_category",
    "app_id", "app_domain", "app_category",
    "device_id", "device_ip", "device_model",
    "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20",
]

# Same list with "C21" appended.
columns = [*features, "C21"]

# df_result = df.withColumn("prediction", predict((struct([col(x) for x in features]))))

# df_result = df.select([col(x) for x in features])

# Pack the `columns` fields into one struct and score it with the pandas UDF.
feature_struct = struct(*map(col, columns))
df_result = df.withColumn("prediction", predict(feature_struct))


In [None]:
# Create Temp View
# df.createOrReplaceTempView("dataframe")

# # Apply UDF in SQL query.
# df_results = spark.sql(f"select predict({', '.join(features)}) as score from dataframe")

# df_results = df_json.select(predict_udf("json").alias("value"))


In [None]:
# Preview the UDF-scored stream on the console for about 10 seconds.
query = df_result.writeStream.format("console").start()

# Alternative sink: write CSV files instead of printing.
# query = df_result.writeStream \
#   .format("csv") \
#   .option("checkpointLocation", "checkpoint/") \
#   .option("path", "data/") \
#   .outputMode("append") \
#   .start()

try:
    # Waits up to 10 seconds. Unlike time.sleep, this raises if the streaming
    # query fails during the preview window.
    query.awaitTermination(10)
finally:
    # Always stop the query, even if the cell is interrupted or the query failed.
    query.stop()


In [None]:
# NOTE(review): `stop` is not defined in any visible cell, so evaluating it
# raises NameError. Presumably a deliberate barrier that halts "Run All" before
# the scratch cells below — confirm, and consider an explicit `raise` with a message.
stop


In [None]:
# # read from Kafka
# df = spark.readStream.format("kafka")
#   .option("kafka.bootstrap.servers", "{external_ip}:9092")
#   .option("subscribe", "dsp").load()
# # select the value field and apply the UDF
# df = df.selectExpr("CAST(value AS STRING)")
# score_udf = udf(score, StringType())
# df = df.select( score_udf("value").alias("value"))
# # Write results to Kafka
# query = df.writeStream.format("kafka")
#   .option("kafka.bootstrap.servers", "{external_ip}:9092")
#   .option("topic", "preds")
#   .option("checkpointLocation", "/temp").start()


In [None]:
# df = df.select('request_id', \
#     F.col('data').getItem('CPU').alias('CPU'), \
#     F.col('data').getItem('memory').alias('memory'), \
#     F.col('data').getItem('disk').alias('disk'))

# df.printSchema()


In [None]:
# Preview the parsed input stream on the console for about 10 seconds.
query = df.writeStream.format("console").start()

try:
    # Waits up to 10 seconds. Unlike time.sleep, this raises if the streaming
    # query fails during the preview window.
    query.awaitTermination(10)
finally:
    # Always stop the query, even if the cell is interrupted or the query failed.
    query.stop()


In [None]:
# NOTE(review): `stop` is not defined in any visible cell, so evaluating it
# raises NameError. Presumably a deliberate barrier that halts "Run All" before
# the scratch cells below — confirm, and consider an explicit `raise` with a message.
stop


In [None]:
!pip install scipy

In [None]:
# Create Temp View
df.createOrReplaceTempView("dataframe")

# A Python UDF is only visible to SQL after it has been registered by name.
spark.udf.register("predict", predict)

# `predict` takes a single struct argument, so pack the feature columns into
# one struct (the same `columns` list used for `df_result`) instead of passing
# every column of the view as a separate argument with `predict(*)`.
feature_list = ", ".join(f"`{name}`" for name in columns)

# Apply UDF in SQL query.
resultDF = spark.sql(
    f"select predict(struct({feature_list})) as up_down_udf from dataframe"
)


In [None]:
@pandas_udf(
    "Count long, Resampled long, Start timestamp, End timestamp",
    PandasUDFType.GROUPED_MAP,
)
def myudf(df):
    """Summarise one group after resampling its `Value` column to a 30-second grid.

    Args:
        df: pandas DataFrame for one group; it is indexed by its `Timestamp`
            column and its `Value` column is interpolated.

    Returns:
        A one-row DataFrame with the number of complete readings (`Count`),
        the number of points on the 30-second grid between the first and last
        reading (`Resampled`), and the first/last timestamps (`Start`, `End`).
        A group with no complete rows yields an empty DataFrame with the same
        columns.
    """
    readings = df.dropna().set_index("Timestamp").sort_index()

    # Without this guard an all-null group reaches date_range(NaT, NaT),
    # which raises.
    if readings.empty:
        return pd.DataFrame(
            {
                "Count": pd.Series(dtype="int64"),
                "Resampled": pd.Series(dtype="int64"),
                "Start": pd.Series(dtype="datetime64[ns]"),
                "End": pd.Series(dtype="datetime64[ns]"),
            }
        )

    # resample the dataframe
    oidx = readings.index
    # A Timedelta avoids the "30S" alias, which newer pandas versions deprecate.
    nidx = pd.date_range(oidx.min(), oidx.max(), freq=pd.Timedelta(seconds=30))
    resampled = pd.DataFrame(
        {
            "Value": readings.Value.reindex(oidx.union(nidx))
            .interpolate("index")
            .reindex(nidx)
        }
    )

    return pd.DataFrame(
        [[len(oidx), len(resampled.index), oidx.min(), oidx.max()]],
        columns=["Count", "Resampled", "Start", "End"],
    )


# NOTE(review): `sensorStream` is not defined in any visible cell — it must be
# created before this cell can run.
# `window` is not among the names imported from pyspark.sql.functions at the
# top of the notebook, so it is reached through the `F` alias instead.
predictionStream = sensorStream.withWatermark("Timestamp", "90 minutes").groupBy(
    col("Name"), F.window(col("Timestamp"), "70 minutes", "5 minutes")
)

# Apply the grouped-map UDF per (Name, window) group and expose the result as
# the in-memory table "aggregates".
predictionStream.apply(myudf).writeStream.queryName("aggregates").format(
    "memory"
).start()


In [None]:
# The Kafka payload lives in the `value` column of the raw stream; the parsed
# `df` only has the JSON fields and no `value` column. A new name is used so
# `df` is not overwritten.
raw_values = df_raw.selectExpr("CAST(value AS STRING)")

# Print the raw message payloads to the console for about 10 seconds.
query = raw_values.writeStream.format("console").start()

try:
    # Waits up to 10 seconds. Unlike time.sleep, this raises if the streaming
    # query fails during the preview window.
    query.awaitTermination(10)
finally:
    # Always stop the query, even if the cell is interrupted or the query failed.
    query.stop()


In [None]:
# `values` is not defined in any visible cell; it is taken to be the raw Kafka
# payload as text — confirm.
values = df_raw.selectExpr("CAST(value AS STRING)")

# Write the payloads as JSON files under ./topic.json for about 10 seconds.
# The chain is parenthesised (the original was missing a line continuation),
# and a checkpoint location is supplied, which file sinks need unless a
# session-wide default is configured.
query = (
    values.writeStream.format("json")
    .option("checkpointLocation", "./checkpoints/topic_json")
    .outputMode("append")
    .start("./topic.json")
)

try:
    # Raises if the streaming query fails during the window.
    query.awaitTermination(10)
finally:
    # Always stop the query, even if the cell is interrupted.
    query.stop()

In [None]:
# Stream `df` to the console in append mode and block this cell until the
# query terminates.
df.writeStream.format("console").outputMode("append").start().awaitTermination()


In [None]:
# Dump the "app_messages" Kafka topic to JSON files under topic.json.
# The chain is parenthesised because a comment after a backslash continuation
# (as in the original) is a syntax error. A checkpoint location is supplied,
# which file sinks need unless a session-wide default is configured.
(
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka-cluster-kafka-bootstrap:9092")
    .option("subscribe", "app_messages")
    .option("startingOffsets", "latest")
    .load()
    .writeStream.format("json")
    .option("checkpointLocation", "checkpoints/app_messages")
    .outputMode("append")
    .start("topic.json")
)