In [None]:
import json
import os
from pathlib import Path
import uuid
import random
import time
from datetime import datetime, timezone

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType
import pyspark.sql.functions as F

# --- Paths -------------------------------------------------------------------
# The input folder is '<parent of the working directory>/.data'; it is created
# here if it does not exist yet (the parent directory itself must exist).
PROJECT_PATH = Path.cwd().parent
DATA_DIR = '.data'
DATA_PATH = PROJECT_PATH / DATA_DIR
DATA_PATH.mkdir(exist_ok=True)

# Spark APIs take plain strings, so keep string versions of the paths.
output_folder = str(DATA_PATH)
checkpoint_path = str(Path.cwd() / 'checkpoint')

# --- Input schema ------------------------------------------------------------
# Explicit schema applied to the JSON files read by the streaming source.
file_schema = (
    StructType()
    .add('id', StringType())
    .add('temperature', DoubleType())
    .add('timestamp', TimestampType())
)

# --- Target table identifiers ------------------------------------------------
schema_name = 'dp700_e011'
table_name = 'temperature_stream'

# --- Spark session -----------------------------------------------------------
# Local session using all available cores; reuses an existing session if any.
spark = (
    SparkSession.builder
    .appName('test')
    .master('local[*]')
    .getOrCreate()
)


In [None]:
# spark.sql(f'CREATE SCHEMA IF NOT EXISTS {schema_name}')


In [None]:
# Read JSON files from the data folder as a stream, using the explicit schema
# and admitting at most one new file per micro-batch.
raw_stream_df = (
    spark.readStream
    .schema(file_schema)
    .option('maxFilesPerTrigger', 1)
    .json(output_folder)
)

# Tag every row with the timestamp at which Spark processed it.
transformed_stream_df = raw_stream_df.withColumn(
    'processed_timestamp',
    F.current_timestamp(),
)

# Publish the results as the in-memory table 'filtered' (queryable via SQL).
#
# No 'checkpointLocation' is set on purpose: an append-mode memory sink cannot
# recover from an existing checkpoint, so pointing it at a fixed directory makes
# every run after the first fail with "This query does not support recovering
# from checkpoint location". Without the option Spark uses a temporary
# checkpoint, which keeps the notebook re-runnable from a fresh kernel.
delta_stream = (
    transformed_stream_df.writeStream
    .format('memory')
    .queryName("filtered")
    .outputMode('append')
    .start()
)

In [None]:
# Poll the streaming query for a bounded amount of time.
#
# A file-source stream stays active until it is stopped explicitly, so looping
# on `isActive` alone would never finish and the cells below (querying the
# result table, stopping the stream) could not run under "Run All".
MAX_STATUS_POLLS = 12        # upper bound on the number of status reports
POLL_INTERVAL_SECONDS = 5    # pause between two status reports

for _ in range(MAX_STATUS_POLLS):
    if not delta_stream.isActive:
        # The query stopped or failed on its own; nothing left to monitor.
        break
    print(delta_stream.status)
    print(delta_stream.lastProgress)
    time.sleep(POLL_INTERVAL_SECONDS)
    

In [None]:
# Show what the streaming query has written so far to the in-memory table
# 'filtered' (the name comes from the query's `queryName`).
# Alternative reading from a named table instead:
#   df = spark.sql(f'SELECT * FROM {schema_name}.{table_name}')
df = spark.sql('SELECT * FROM filtered')
df.show()

In [None]:
# Stop the streaming query first, then shut down the Spark session.
# The `finally` guarantees the session is released even if stopping the
# query raises.
try:
    delta_stream.stop()
finally:
    spark.stop()