In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType
from write_to_postgres import postgres_writer
from write_to_bigquery import bigquery_transform, bigquery_writer
from anomaly_alert import get_anomaly, anomaly_writer

In [None]:
# Build (or reuse) the SparkSession that drives the clickstream pipeline.
#
# The cluster master URL and the extra connector jars default to the values of the
# original deployment, but each can be overridden through an environment variable so
# the notebook also runs on other machines/clusters without editing this cell:
#   SPARK_MASTER_URL  - Spark master to connect to
#   SPARK_EXTRA_JARS  - comma-separated jar paths passed as "spark.jars"
# `or` (rather than a .get() default) makes an empty variable fall back to the default too.
spark_master_url = (
    os.environ.get("SPARK_MASTER_URL")
    or "spark://myinstance.us-central1-b.c.kafkaproject-446911.internal:7077"
)
spark_extra_jars = (
    os.environ.get("SPARK_EXTRA_JARS")
    or "/home/aryanp211/spark/jars/postgresql-42.5.4.jar,/home/aryanp211/kafka/plugins/spark-bigquery-latest_2.12.jar"
)

spark = (
    SparkSession.builder.appName("kafka-Clickstream")
    .master(spark_master_url)
    # Kafka source/sink connector, resolved as a Maven coordinate.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4")
    # Jars supplied as local files (PostgreSQL JDBC driver and BigQuery connector by default).
    .config("spark.jars", spark_extra_jars)
    .getOrCreate()
)

In [None]:
# Kafka connection settings used by the source below and by the anomaly sink.
# Defaults match the original local setup; set KAFKA_TOPIC / KAFKA_BOOTSTRAP_SERVERS
# in the environment to point the notebook at a different topic or broker list.
# An unset or empty variable falls back to the default.
kafka_topic = os.environ.get("KAFKA_TOPIC") or "clickstream"
bootstrap_servers = os.environ.get("KAFKA_BOOTSTRAP_SERVERS") or "localhost:9092"

In [None]:
# Layout of one clickstream event as encoded in the JSON payload of each Kafka record.
# Each entry is (field name, Spark type); every field is declared nullable below.
clickstream_fields = [
    ("user_id", StringType()),
    ("session_id", IntegerType()),
    ("timestamp", TimestampType()),
    ("page_url", StringType()),
    ("event_type", StringType()),
    ("referrer_url", StringType()),
    ("device", StringType()),
    ("location", StringType()),
    ("user_agent", StringType()),
    ("ip_address", StringType()),
    ("event_duration", IntegerType()),
    ("search_query", StringType()),
    ("product_id", IntegerType()),
    ("product_name", StringType()),
    ("order_id", StringType()),
    ("total_amount", FloatType()),
    ("rating", IntegerType()),
    ("cart_value", FloatType()),
    ("payment_method", StringType()),
    ("scroll_percentage", IntegerType()),
    ("login_method", StringType()),
    ("logout_time", TimestampType()),
    ("filter_applied", StringType()),
]
clickstream_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in clickstream_fields]
)

# Streaming source: subscribe to the topic, starting from the earliest offsets.
kafka_stream = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", bootstrap_servers)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .load()
)

# Cast the record value to a string, parse it as JSON against the schema, and
# flatten the resulting struct so each event field becomes a top-level column.
parsed_event = from_json(kafka_stream.value.cast("string"), schema=clickstream_schema)
df = kafka_stream.withColumn("data", parsed_event).select("data.*")


In [None]:
# BigQuery data: bigquery_transform (from write_to_bigquery) returns three frames built
# from the parsed stream; they are unpacked in the order daily, session, purchase.
bigquery_frames = bigquery_transform(df)
daily_df, session_df, purchase_df = bigquery_frames

In [None]:
# Anomaly detection: pass the parsed clickstream frame to get_anomaly (imported from
# anomaly_alert). The detection rules live in that module and are not visible here.
anomaly_df = get_anomaly(df)

In [None]:
# Start every streaming sink. Each writer returns the handle of the query it started.

# Raw parsed events -> PostgreSQL
pg_query = postgres_writer(df)

# Aggregated frames -> BigQuery
daily_query = bigquery_writer(daily_df)
session_query = bigquery_writer(session_df)
purchase_query = bigquery_writer(purchase_df)

# Detected anomalies -> Kafka topic
anomaly_query = anomaly_writer(anomaly_df, bootstrap_servers)

# Block on all active queries at once instead of calling awaitTermination() on each
# handle in turn. A sequential wait parks on pg_query indefinitely, so a failure in
# any of the other four queries would go unnoticed for as long as the Postgres query
# keeps running. awaitAnyTermination() returns as soon as any query on this session
# stops and re-raises the exception if that query failed.
spark.streams.awaitAnyTermination()