In [None]:
import configparser
import findspark
import os
import pyspark
import time

from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql.functions import col, from_json

In [None]:
# Load connection settings from config.ini (path is relative to the working directory).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, so check what was
# actually loaded and fail here instead of with an opaque KeyError further down.
loaded_files = config.read('config.ini')
if not loaded_files:
    raise FileNotFoundError("config.ini was not found or could not be read from the working directory")
loaded_files

In [None]:
# Topic name and server address come from the [LOCAL] section of the config;
# SERVER is later passed as the Kafka bootstrap server and TOPIC as the subscription.
local_settings = config['LOCAL']
TOPIC = local_settings['TOPIC']
SERVER = local_settings['SERVER']

In [None]:
# Initialise findspark before any Spark context is created in the cells below.
findspark.init()

In [None]:
# Submit arguments read when the Spark context is launched: request the Kafka
# source package for Structured Streaming, then the mandatory `pyspark-shell` token.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join((
    '--packages',
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1',
    'pyspark-shell',
))

In [None]:
# Spark configuration: single local executor, fixed application name, and the
# driver bound to the loopback address.
conf = (
    pyspark.SparkConf()
    .setMaster('local')
    .setAppName('Machine-IoT-Monitor')
    .set('spark.driver.host', '127.0.0.1')
)
# getOrCreate() reuses a context that is already running, so this cell can be
# re-executed without "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)
# The session attaches to the context obtained above.
spark = SparkSession.builder.appName('Machine-IoT-Monitor').getOrCreate()

In [None]:
# Optional: reduce Spark log noise on the context created above.
# sc.setLogLevel('ERROR')

In [None]:
# Layout used to parse the JSON payload of each message; every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('id_machine', StringType()),
        ('temperature', IntegerType()),
        ('rpm', IntegerType()),
        ('timestamp', StringType()),
    )
])

In [None]:
# Streaming DataFrame backed by the Kafka source: connect to SERVER and
# subscribe to TOPIC. Nothing is read until a streaming query is started.
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', SERVER)
    .option('subscribe', TOPIC)
    .load()
)

In [None]:
# Keep only the message payload (`value` column), cast to a string so it can be
# parsed as JSON in a later cell. Note: this rebinds `df`, so re-running this
# cell alone applies the projection again.
df = df.selectExpr('CAST(value AS STRING)')

In [None]:
# from pyspark.sql.functions import col, from_json
# display(
#   df.select(col('value'), from_json(col('value'), topic_schema, {"mode" : "PERMISSIVE"}))
# )

In [None]:
# Parse the JSON text in `value` with the declared schema into a struct column,
# then expand that struct so each schema field becomes a top-level column.
df = (
    df
    .withColumn('jsonData', from_json(col('value'), schema))
    .select('jsonData.*')
)


In [None]:
# Print the column layout of the parsed stream to confirm the flattening step.
df.printSchema()

---

In [None]:
# df_streaming = df.groupby('id_machine').mean('temperature', 'rpm')

In [None]:
# df_streaming = \
#     df_streaming.select(
#         col('id_machine'),
#         col('avg(temperature)').alias('avg_temperature'),
#         col('avg(rpm)').alias('avg_rpm')
#     )

# The per-machine aggregation above is commented out, so the stream that gets
# written is the parsed, un-aggregated DataFrame.
df_streaming = df

In [None]:
# df_streaming.printSchema()

---

In [None]:
# Name for the in-memory result table, suffixed with the local wall-clock time
# (HHMMSS) at the moment this cell runs.
temp_table = "machine_data_" + time.strftime('%H%M%S')

In [None]:
# Start the streaming query and expose its results as an in-memory table named
# `temp_table`, queryable through spark.sql().
# `df_streaming` carries no streaming aggregation, and Spark rejects the
# 'complete' output mode for such queries (AnalysisException), so rows are
# appended instead. Switch back to 'complete' only if an aggregation is restored.
streaming = (
    df_streaming.writeStream
    .queryName(temp_table)
    .outputMode('append')
    .format('memory')
    .start()
)

# streaming = df \
#     .writeStream \
#     .queryName(temp_table) \
#     .outputMode('append') \
#     .format('memory') \
#     .start()

In [None]:
# Active streams
# spark.streams.active

In [None]:
# spark.sql(f"SELECT id_machine, avg_temperature, avg_rpm from {temp_table}").show()
# One-off look at whatever the streaming query has written to the in-memory
# table so far.
first_look = spark.sql(f"SELECT * from {temp_table}")
first_look.show()

In [None]:
# Poll the in-memory table 10 times, 15 seconds apart, printing its current
# contents on each pass.
try:
    for _ in range(10):
        # spark.sql(f"SELECT id_machine, avg_temperature, avg_rpm from {temp_table}").show()
        spark.sql(f"SELECT id_machine, temperature, rpm, timestamp from {temp_table}").show()
        time.sleep(15)
finally:
    # Stop the streaming query even if a poll fails or the cell is interrupted,
    # so it does not keep running in the background.
    streaming.stop()