In [None]:
# Anomaly detection

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a Column, or NULL when the denominator is 0."""
    # `when` without `otherwise` yields NULL for the unmatched case, which keeps
    # empty windows from raising a divide-by-zero error under ANSI SQL mode.
    return when(denominator != 0, numerator / denominator)


def get_anomaly(df):
    """Return the sliding windows whose clickstream metrics look anomalous.

    Events are grouped into 10-minute windows that slide every 5 minutes
    (10-minute watermark on ``timestamp``). For each window the function
    computes page views, distinct active sessions, purchase conversion rate
    (purchases / checkout starts) and checkout abandonment rate
    ((starts - completes) / starts), then compares them against the average
    of the three preceding windows.

    A window is reported when any of these holds:
      * page views fall below 50% of the trailing average,
      * active sessions fall below 40% of the trailing average,
      * the trailing average purchase conversion rate is below 0.1,
      * the trailing average checkout abandonment rate is above 0.8.

    Args:
        df: DataFrame providing ``timestamp``, ``event_type`` and
            ``session_id`` columns.

    Returns:
        DataFrame with columns ``window``, ``page_views``,
        ``active_sessions``, ``purchase_conversion_rate`` and
        ``checkout_abandonment_rate`` for the flagged windows only.
    """

    def event_count(event_type):
        # count() ignores NULLs, so this counts only rows of the given type.
        return count(when(col("event_type") == event_type, 1))

    checkout_starts = event_count("checkout_start")
    checkout_completes = event_count("checkout_complete")

    # NOTE(review): Structured Streaming documents exact distinct aggregations
    # (countDistinct) as unsupported on streaming DataFrames; if `df` is a
    # stream, confirm this runs or switch to approx_count_distinct.
    clickstream_df = (
        df.withWatermark("timestamp", "10 minutes")
        .groupBy(window("timestamp", "10 minutes", "5 minutes"))
        .agg(
            event_count("page_view").alias("page_views"),
            countDistinct(col("session_id")).alias("active_sessions"),
            _safe_ratio(event_count("purchase"), checkout_starts)
            .alias("purchase_conversion_rate"),
            # Abandonment is measured against checkouts that were started.
            _safe_ratio(checkout_starts - checkout_completes, checkout_starts)
            .alias("checkout_abandonment_rate"),
        )
    )

    # Historical moving average over the 3 previous windows. Ordering must be
    # ascending by window start so that rows -3..-1 are the *earlier* windows.
    # NOTE(review): row-based window functions are documented as unsupported
    # on streaming DataFrames; for a streaming `df` this step likely needs to
    # run inside foreachBatch - confirm against the caller.
    avg_window = Window.orderBy(col("window.start").asc()).rowsBetween(-3, -1)

    # NOTE(review): low_conversion / high_abandonment test the trailing
    # averages rather than the current window's value - confirm intended.
    anomaly_df = (
        clickstream_df
        .withColumn("avg_page_views", avg(col("page_views")).over(avg_window))
        .withColumn("avg_active_sessions", avg(col("active_sessions")).over(avg_window))
        .withColumn(
            "avg_purchase_conversion_rate",
            avg(col("purchase_conversion_rate")).over(avg_window),
        )
        .withColumn(
            "avg_checkout_abandonment_rate",
            avg(col("checkout_abandonment_rate")).over(avg_window),
        )
        .withColumn("page_view_drop", col("page_views") < col("avg_page_views") * 0.5)
        .withColumn("session_drop", col("active_sessions") < col("avg_active_sessions") * 0.4)
        .withColumn("low_conversion", col("avg_purchase_conversion_rate") < 0.1)
        .withColumn("high_abandonment", col("avg_checkout_abandonment_rate") > 0.8)
    )

    # NULL flags (e.g. the first window, which has no history) are treated as
    # "not anomalous" by filter(), same as the explicit `== True` comparison.
    is_anomalous = (
        col("page_view_drop")
        | col("session_drop")
        | col("low_conversion")
        | col("high_abandonment")
    )

    return anomaly_df.filter(is_anomalous).select(
        "window",
        "page_views",
        "active_sessions",
        "purchase_conversion_rate",
        "checkout_abandonment_rate",
    )


In [None]:
def anomaly_writer(anomaly_df, bootstrap_servers,
                   anomaly_topic="clickstream_anomaly",
                   checkpoint_location=None):
    """Start a streaming query that publishes anomaly rows to a Kafka topic.

    Each row of ``anomaly_df`` is serialised to a single JSON string (all
    columns wrapped in a struct) and written as the Kafka record ``value``.

    Args:
        anomaly_df: Streaming DataFrame of anomaly rows to publish.
        bootstrap_servers: Value for the ``kafka.bootstrap.servers`` option.
        anomaly_topic: Destination Kafka topic; defaults to the previously
            hard-coded ``"clickstream_anomaly"``.
        checkpoint_location: Optional path passed as the ``checkpointLocation``
            option. When ``None`` the option is not set and Spark falls back
            to its session-level checkpoint configuration.

    Returns:
        The handle returned by ``writeStream...start()`` for the running query.
    """
    # Send anomaly alerts to a Kafka topic.
    writer = (
        anomaly_df.selectExpr("to_json(struct(*)) as value")
        .writeStream.format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("topic", anomaly_topic)
    )
    if checkpoint_location is not None:
        writer = writer.option("checkpointLocation", checkpoint_location)
    return writer.start()