# 5.0 Event Streaming

###### Author: Yeap Jie Shen, Gan Yee Jing
###### Last Edited: 02/09/2024

## 5.3 Kafka Streaming (Predictions) 
### 5.3.1 Importing Necessary Libraries

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType

import sys

sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from kafka import KafkaConsumer, KafkaProducer
from data_stores.vectorArrayConverter import VectorArrayConverter
import json

# Build the Spark session used by every cell below; getOrCreate() hands back
# the already-running session if this cell is executed again.
spark = (
    SparkSession.builder
    .appName('Kafka Streaming')
    .getOrCreate()
)

24/09/02 17:38:09 WARN Utils: Your hostname, Gan. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/02 17:38:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/02 17:38:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Prediction-time building blocks: the feature assembler and the trained classifier.

# Columns that are concatenated into the single 'features' vector fed to the model.
FEATURE_COLUMNS = ['1tf_idf_content', '1gram_word2vec_content']
assembler = VectorAssembler(
    inputCols = FEATURE_COLUMNS,
    outputCol = 'features',
)

# Location of the pretrained RandomForestClassificationModel, relative to this notebook.
MODEL_PATH = r'../model/best_model'
rf_model = RandomForestClassificationModel.load(MODEL_PATH)

                                                                                

In [3]:
# Shell commands used to reset the two output topics before a run:
# kafka-topics.sh --delete --bootstrap-server localhost:9092 --topic DrugOffences
# kafka-topics.sh --delete --bootstrap-server localhost:9092 --topic MurderAndHomicide

# kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic DrugOffences
# kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic MurderAndHomicide

# Define the Kafka topic to subscribe to
topic_name = 'ProcessedCrimeNews'

# Predicted label -> output topic (0 -> DrugOffences, 1 -> MurderAndHomicide).
# Any other label has no output topic and is reported as skipped.
topic_by_prediction = {0.0: 'DrugOffences', 1.0: 'MurderAndHomicide'}

# Initialize the Kafka consumer
consumer = KafkaConsumer(topic_name, bootstrap_servers = 'localhost:9092', auto_offset_reset = 'earliest', value_deserializer = lambda x: x.decode('utf-8'))

# Initialize the Kafka Producer
producer = KafkaProducer(value_serializer = lambda v: json.dumps(v).encode('utf-8'), bootstrap_servers = 'localhost:9092')

# Consume messages from Kafka, classify each one and republish it to the topic
# matching its predicted label. Runs until interrupted from the notebook.
try:
    print(f"Subscribing to topic '{topic_name}'")
    count = 0
    for message in consumer:
        json_value = json.loads(message.value)

        # One incoming message becomes a one-row DataFrame.
        df = spark.createDataFrame([Row(**json_value)])

        # The feature columns arrive as plain arrays; turn them back into
        # vectors so they can be assembled and scored.
        df = (
            df
            .withColumn('1tf_idf_content', VectorArrayConverter.array_to_vector('1tf_idf_content'))
            .withColumn('1gram_word2vec_content', VectorArrayConverter.array_to_vector('1gram_word2vec_content'))
        )

        df = assembler.transform(df)

        # Use the loaded model to make predictions
        df_predictions = rf_model.transform(df)

        # Convert the vectors back to arrays so the rows can be JSON-serialised,
        # then bring the rows to the driver.
        prediction_list = (
            df_predictions
            .withColumn('1tf_idf_content', VectorArrayConverter.vector_to_array(df_predictions['1tf_idf_content']))
            .withColumn('1gram_word2vec_content', VectorArrayConverter.vector_to_array(df_predictions['1gram_word2vec_content']))
            .select('url', 'headline', 'datetime', 'author', 'publisher', '1tf_idf_content', '1gram_word2vec_content', 'prediction')
            .collect()
        )

        # Outputting relevant information. The row count comes from the
        # collected rows, which avoids a separate count() job per message.
        count += len(prediction_list)
        print('Processed', count)

        df_predictions.select('headline', 'prediction').show()

        for row in prediction_list:
            target_topic = topic_by_prediction.get(row['prediction'])
            if target_topic is None:
                print(f"Skipped (no topic for prediction {row['prediction']})")
                continue
            # The Row is passed to the JSON serializer as-is; the reader cells
            # re-attach the field names by position, so the select() order
            # above must stay in sync with them.
            producer.send(topic = target_topic, value = row)
            print(f'Published to {target_topic}')

except KeyboardInterrupt:
    print("Stopped by user")
finally:
    try:
        # send() only queues records for background delivery; flush so the
        # predictions published just before the interrupt are not lost.
        producer.flush()
    finally:
        producer.close()
        consumer.close()

Subscribing to topic 'ProcessedCrimeNews'


                                                                                

Processed 1


24/09/02 17:38:25 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:25 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:25 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|altantuyas family...|       1.0|
+--------------------+----------+



24/09/02 17:38:26 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to MurderAndHomicide


                                                                                

Processed 2


24/09/02 17:38:30 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:31 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:31 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|ambank founders m...|       1.0|
+--------------------+----------+



24/09/02 17:38:32 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to MurderAndHomicide
Processed 3


24/09/02 17:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|excops death pena...|       1.0|
+--------------------+----------+



24/09/02 17:38:36 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to MurderAndHomicide
Processed 4


24/09/02 17:38:39 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:39 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:39 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|man pleads guilty...|       1.0|
+--------------------+----------+



24/09/02 17:38:40 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to MurderAndHomicide
Processed 5


24/09/02 17:38:42 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:42 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:43 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|police arrest two...|       0.0|
+--------------------+----------+



24/09/02 17:38:43 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to DrugOffences
Processed 6


24/09/02 17:38:46 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:46 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:46 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|four charged with...|       1.0|
+--------------------+----------+



24/09/02 17:38:47 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to MurderAndHomicide
Processed 7


24/09/02 17:38:49 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:49 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:49 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|november defence ...|       3.0|
+--------------------+----------+



24/09/02 17:38:50 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Processed 8


24/09/02 17:38:52 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:52 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:53 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|man walks free af...|       5.0|
+--------------------+----------+



24/09/02 17:38:53 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Processed 9


24/09/02 17:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|police uncover pa...|       1.0|
+--------------------+----------+



24/09/02 17:38:57 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to MurderAndHomicide
Processed 10


24/09/02 17:38:59 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:59 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:38:59 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|johor broker lose...|       2.0|
+--------------------+----------+



24/09/02 17:39:00 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Processed 11


24/09/02 17:39:02 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:39:03 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:39:03 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|businessman plead...|       0.0|
+--------------------+----------+



24/09/02 17:39:04 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Published to DrugOffences
Processed 12


24/09/02 17:39:06 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:39:06 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 17:39:06 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


+--------------------+----------+
|            headline|prediction|
+--------------------+----------+
|asean secgen meet...|       3.0|
+--------------------+----------+



24/09/02 17:39:07 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Stopped by user


In [12]:
# Read back what was published to the 'DrugOffences' topic and print one
# summary line per article. Runs until interrupted from the notebook.
drug_consumer = KafkaConsumer(
    'DrugOffences',
    bootstrap_servers = 'localhost:9092',
    auto_offset_reset = 'earliest',
    value_deserializer = lambda raw: raw.decode('utf-8'),
)

# Field names, in the positional order they are paired with the decoded payload.
field_names = ['url', 'headline', 'datetime', 'author', 'publisher', '1tf_idf_content', '1gram_word2vec_content', 'prediction']

try:
    for message in drug_consumer:
        json_value = json.loads(message.value)

        # Re-attach the field names to the values by position.
        news = dict(zip(field_names, json_value))

        print(f"Predicted Category: {news['prediction']} Headline: {news['headline']:55s} Publisher: {news['publisher']:20s} Date Published: {news['datetime']:20s} URL: {news['url']:50s}")
        print()

except KeyboardInterrupt:
    print("Stopped by user")
finally:
    drug_consumer.close()

Predicted Category: 0.0 Headline: police arrest two traffickers seize rm mln in drugs     Publisher: Selangor Journal     Date Published: 2024-08-24T22:08:01+08:00 URL: https://selangorjournal.my/2024/08/police-arrest-two-traffickers-seize-rm2-35-mln-in-drugs/

Predicted Category: 0.0 Headline: businessman pleads not guilty to cheating charge        Publisher: Selangor Journal     Date Published: 2024-08-29T09:28:22+08:00 URL: https://selangorjournal.my/2024/08/businessman-pleads-not-guilty-to-cheating-charge/

Stopped by user


In [13]:
# Read back what was published to the 'MurderAndHomicide' topic and print one
# summary line per article. Runs until interrupted from the notebook.
murder_consumer = KafkaConsumer(
    'MurderAndHomicide',
    bootstrap_servers = 'localhost:9092',
    auto_offset_reset = 'earliest',
    value_deserializer = lambda raw: raw.decode('utf-8'),
)

# Field names, in the positional order they are paired with the decoded payload.
field_names = ['url', 'headline', 'datetime', 'author', 'publisher', '1tf_idf_content', '1gram_word2vec_content', 'prediction']

try:
    for message in murder_consumer:
        json_value = json.loads(message.value)

        # Re-attach the field names to the values by position.
        news = dict(zip(field_names, json_value))

        print(f"Predicted Category: {news['prediction']} Headline: {news['headline']:55s} Publisher: {news['publisher']:20s} Date Published: {news['datetime']:20s} URL: {news['url']:50s}")
        print()

except KeyboardInterrupt:
    print("Stopped by user")
finally:
    murder_consumer.close()

Predicted Category: 1.0 Headline: altantuyas family files bankruptcy notice against razak baginda Publisher: Selangor Journal     Date Published: 2024-08-29T17:10:35+08:00 URL: https://selangorjournal.my/2024/08/altantuyas-family-files-bankruptcy-notice-against-razak-baginda/

Predicted Category: 1.0 Headline: ambank founders murder federal court dismisses extow truck drivers death penalty review Publisher: Selangor Journal     Date Published: 2024-08-21T16:02:37+08:00 URL: https://selangorjournal.my/2024/08/ambank-founders-murder-federal-court-dismisses-ex-tow-truck-drivers-death-penalty-review/

Predicted Category: 1.0 Headline: excops death penalty reinstated for stepdaughters murder Publisher: Selangor Journal     Date Published: 2024-08-29T14:05:50+08:00 URL: https://selangorjournal.my/2024/08/ex-cops-death-penalty-reinstated-for-stepdaughters-murder/

Predicted Category: 1.0 Headline: man pleads guilty to kicking hitting wife with broom    Publisher: Selangor Journal     Date Pub

In [11]:
# Stop Spark session
# This stops the session bound to `spark`, which is created in the notebook's
# first code cell; cells that use `spark` must be run before this one.
spark.stop()