In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Connection settings. Hosts and credentials can be overridden through
# environment variables so they do not have to be committed with the notebook;
# the fallbacks are the original development values.
cassandra_host = os.environ.get("CASSANDRA_HOST", "192.168.31.188")
cassandra_user = os.environ.get("CASSANDRA_USER", "cassandra")
cassandra_pwd  = os.environ.get("CASSANDRA_PASSWORD", "cassandra")
cassandra_port = int(os.environ.get("CASSANDRA_PORT", "9042"))
# NOTE(review): the Cassandra write in this notebook uses the lowercase
# literals 'loganalysis' / 'apachelogs' rather than these two names -- confirm
# which spelling matches the actual keyspace/table before wiring them in.
key_space      = "LogAnalysis"
table_name     = "ApacheLogs"
kafka_server   = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "192.168.31.188:9092")
kafka_topic    = os.environ.get("KAFKA_TOPIC", "logs")

# Spark session configured with the Kafka structured-streaming source and the
# DataStax Cassandra connector (both pulled in through spark.jars.packages).
spark_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0",
    "com.datastax.spark:spark-cassandra-connector_2.12:3.0.0",
    "com.datastax.spark:spark-cassandra-connector-driver_2.12:3.0.0",
])

spark = (
    SparkSession.builder.appName("pyspark-notebook")
    .config("spark.jars.packages", spark_packages)
    .config("spark.cassandra.connection.host", cassandra_host)
    # Pass the configured port explicitly instead of silently relying on the
    # connector's default.
    .config("spark.cassandra.connection.port", str(cassandra_port))
    .config("spark.cassandra.auth.username", cassandra_user)
    .config("spark.cassandra.auth.password", cassandra_pwd)
    .getOrCreate()
)


:: loading settings :: url = jar:file:/home/mpp/.conda/envs/data-engineering/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/mpp/.ivy2/cache
The jars for the packages stored in: /home/mpp/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
com.datastax.spark#spark-cassandra-connector-driver_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-36966146-3c96-49be-9aae-e762c31f8235;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found com.datastax.spark#spark-cassandra-conne

In [2]:
# Options for the Kafka structured-streaming source.
kafka_params = dict([
    ("kafka.bootstrap.servers", kafka_server),
    ("subscribe", kafka_topic),
    ("startingOffsets", "latest"),  # adjust as needed
])

# Streaming DataFrame over the subscribed Kafka topic.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .options(**kafka_params)
    .load()
)


In [3]:
from pyspark.sql.types import StringType, StructType


# Decode the Kafka payload to text, then pull the `message` field out of the
# JSON document it carries.
parsed_stream = (
    kafka_stream
    .selectExpr("CAST(value AS STRING) as kafka_message")
    .selectExpr("get_json_object(kafka_message, '$.message') as message")
)

In [4]:
# [Tue Aug 15 15:32:26 2023] [quos:notice] [pid 1100:tid 925] [client 20.38.24.44:13532] 
# If we transmit the capacitor, we can get to the COM hard drive through the cross-platform JSON array

from pyspark.sql.functions import udf
from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig

from pyspark.sql.types import StructType,StructField, IntegerType,Row

# Drain3 setup: the miner's persistence handler is pointed at drain3_state.bin
# (a path relative to the working directory).
persistence = FilePersistence("drain3_state.bin")
config = TemplateMinerConfig()
# Read the miner settings from drain3.ini (also a relative path).
config.load("drain3.ini")
# Assigned after load() so this value wins over whatever the ini file set.
config.profiling_enabled = False
# NOTE(review): this object is referenced from inside the `parser` UDF, so it
# presumably has to be serializable to the Spark workers -- confirm.
template_miner = TemplateMiner(persistence, config)

def parser(line):
    """Split one Apache error-log line and match its message to a Drain3 template.

    The expected layout is four bracketed prefix fields followed by the
    free-text message, e.g.
    ``[Tue Aug 15 15:32:26 2023] [quos:notice] [pid 1100:tid 925] [client 20.38.24.44:13532] <message>``.

    Parameters
    ----------
    line : str or None
        Raw log line. ``None`` (a SQL NULL reaching the UDF) is treated as an
        unparseable line instead of raising.

    Returns
    -------
    Row
        Row with the fields ``cluster_id``, ``date_time``, ``message`` and
        ``template``. ``cluster_id`` is ``-1`` when the line is missing or has
        fewer than five fields, and ``-2`` when the line parsed but
        ``template_miner.match`` returned no cluster.
    """
    make_row = Row('cluster_id', 'date_time', 'message', 'template')

    if line is None:
        return make_row(-1, None, None, None)

    # maxsplit=4: only the four prefix fields are split off, so a '] ' that
    # appears inside the message itself no longer truncates the message.
    split_line = line.rstrip().split('] ', 4)
    if len(split_line) < 5:
        return make_row(-1, None, None, None)

    date_time = split_line[0][1:]  # drop the leading '['
    message = split_line[4]

    cluster = template_miner.match(message)
    if cluster is None:
        return make_row(-2, date_time, message, None)
    return make_row(cluster.cluster_id, date_time, message, cluster.get_template())

# Struct returned by `parser`: one nullable field per Row attribute.
schema = (
    StructType()
    .add('cluster_id', IntegerType(), True)
    .add('date_time', StringType(), True)
    .add('message', StringType(), True)
    .add('template', StringType(), True)
)

# Spark UDF wrapper around `parser` that yields the struct above.
udf_parser = udf(parser, schema)


In [5]:
# Run the parser UDF on every message and flatten the resulting struct into
# top-level columns.
final_stream = (
    parsed_stream
    .withColumn("parsed", udf_parser(parsed_stream["message"]))
    .select("parsed.*")
)

In [6]:
# query = final_stream.writeStream \
#     .outputMode("append") \
#     .format("console") \
#     .start()

# query.awaitTermination()

### Foreach batch method

In [7]:
def process_row(df, epoch_id):
    """Append one micro-batch to the Cassandra table ``loganalysis.apachelogs``.

    Only the Cassandra write (hot path) is active; the HDFS write (cold path)
    is kept below as a disabled comment.

    Parameters
    ----------
    df : DataFrame
        DataFrame holding the rows of the current micro-batch.
    epoch_id : int
        Unique id of the micro-batch/epoch (not used by the body).
    """
    cassandra_writer = (
        df.write
        .format("org.apache.spark.sql.cassandra")
        .mode('append')
        .options(table='apachelogs', keyspace='loganalysis')
    )
    cassandra_writer.save()  # hot path

    # Cold path (disabled):
    # df.write.csv("hdfs://192.168.31.188:8020/output/apache_logs/",mode="append")

In [8]:
# Start the streaming query: every micro-batch is handed to process_row,
# which appends it to Cassandra. The query handle is kept so the stream can be
# shut down instead of being left running in the background when the cell is
# interrupted.
streaming_query = (
    final_stream
    .writeStream
    .option("checkpointLocation", "checkpoint/data")
    .foreachBatch(process_row)
    .start()
)

try:
    streaming_query.awaitTermination()
finally:
    # Runs on normal termination, on a query failure and on KeyboardInterrupt
    # (kernel interrupt); the original exception, if any, still propagates.
    streaming_query.stop()

23/08/16 16:40:12 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/08/16 16:40:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
23/08/16 16:40:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
23/08/16 16:40:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
23/08/16 16:40:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
23/08/16 16:40:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when Ka

KeyboardInterrupt: 