In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType
from pyspark.sql.functions import from_json

# Start the SparkSession and pull in the Kafka source connector.
#
# The spark-sql-kafka artifact has to be built for the same Spark release as
# the installed PySpark. Deriving its version from pyspark.__version__ keeps the
# two in step; a hard-coded "3.5.3" goes stale as soon as PySpark is upgraded
# or downgraded in this environment.
# Assumes the Scala 2.12 build of Spark, as the original pin did.
import pyspark

KAFKA_PACKAGE = "org.apache.spark:spark-sql-kafka-0-10_2.12:" + pyspark.__version__

# NOTE: getOrCreate() returns an already-running session if there is one, and
# spark.jars.packages is resolved when the JVM is launched. After changing the
# package (or reinstalling PySpark) restart the kernel before running this cell.
spark = (
    SparkSession.builder
    .appName("RealTimeAnomalyDetection")
    .config("spark.jars.packages", KAFKA_PACKAGE)
    .config("spark.sql.streaming.checkpointLocation", "file:///C:/checkpoint_location")
    .getOrCreate()
)

# Reduce Spark's log output to warnings and errors.
spark.sparkContext.setLogLevel("WARN")

# Schema used to parse the JSON payload: one float column per field name,
# in the order listed here.
field_names = [
    "V1", "V6", "V8", "V13", "V15", "V19", "V20",
    "V21", "V23", "V24", "V25", "V26", "V27", "V28",
    "year", "month", "day", "hour", "weekday",
    "log_amount", "Class",
]
schema = StructType([StructField(name, FloatType()) for name in field_names])

# Open a streaming read on the Kafka topic "input_topic" via the local broker.
input_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "input_topic")
    .load()
)

# Keep only the message value, cast to a string so it can be parsed as JSON.
json_data = input_stream.selectExpr("CAST(value AS STRING) as value")

# Parse the JSON text against `schema` into a single struct column named "data".
parsed_data = json_data.select(from_json("value", schema).alias("data"))

# Flatten the struct: each field of "data" becomes its own top-level column.
final_data = parsed_data.select("data.*")

# Anomaly classification or any other processing of final_data belongs here.
# (final_data is a streaming DataFrame, so it cannot be inspected with .show();
# results are only available through a writeStream sink such as the one below.)

# Write each newly arrived row to the console sink.
query = (
    final_data.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

try:
    # Block this cell until the streaming query terminates or fails.
    query.awaitTermination()
finally:
    # Interrupting the cell only unblocks the Python side; stop the query
    # explicitly so it does not keep running in the background.
    query.stop()


Py4JJavaError: An error occurred while calling o113.load.
: java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.sql.kafka010.KafkaSourceProvider$
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.org$apache$spark$sql$kafka010$KafkaSourceProvider$$validateStreamOptions(KafkaSourceProvider.scala:338)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.sourceSchema(KafkaSourceProvider.scala:71)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:233)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:118)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:118)
	at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:36)
	at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:169)
	at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:145)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.ExceptionInInitializerError: Exception java.lang.NoClassDefFoundError: org/apache/kafka/common/serialization/ByteArraySerializer [in thread "Thread-4"]
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$.<init>(KafkaSourceProvider.scala:601)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$.<clinit>(KafkaSourceProvider.scala)
	... 20 more


In [14]:
!pip uninstall pyspark -y


Found existing installation: pyspark 3.5.3
Uninstalling pyspark-3.5.3:
  Successfully uninstalled pyspark-3.5.3


In [15]:
!pip install pyspark==3.3.0

Collecting pyspark==3.3.0
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
     ---------------------------------------- 0.0/281.3 MB ? eta -:--:--
     ---------------------------------------- 0.3/281.3 MB ? eta -:--:--
     ---------------------------------------- 0.8/281.3 MB 3.4 MB/s eta 0:01:24
     ---------------------------------------- 1.8/281.3 MB 3.6 MB/s eta 0:01:18
     ---------------------------------------- 2.6/281.3 MB 3.7 MB/s eta 0:01:16
     ---------------------------------------- 3.4/281.3 MB 3.5 MB/s eta 0:01:19
      --------------------------------------- 4.2/281.3 MB 3.6 MB/s eta 0:01:16
      --------------------------------------- 5.0/281.3 MB 3.7 MB/s eta 0:01:15
      --------------------------------------- 5.8/281.3 MB 3.7 MB/s eta 0:01:15
      --------------------------------------- 6.6/281.3 MB 3.8 MB/s eta 0:01:14
     - -------------------------------------- 7.6/281.3 MB 3.8 MB/s eta 0:01:12
     - -------------------------------------- 8.4/281.3 MB 3.9


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
