In [15]:
import json
import os
from pathlib import Path
import uuid
import random
import time
from datetime import datetime, timezone
import shutil

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType
import pyspark.sql.functions as F

# --- Input location --------------------------------------------------------
# JSON input files are read from the `.data` folder that sits next to the
# notebook's working directory (i.e. in its parent directory).
PROJECT_PATH = Path.cwd().parent
DATA_DIR = '.data'
DATA_PATH = PROJECT_PATH / DATA_DIR
DATA_PATH.mkdir(exist_ok=True)
output_folder = str(DATA_PATH)

# --- Per-run scratch locations ---------------------------------------------
# Both live in the working directory and are removed up front so every run
# starts without leftovers from a previous one; a missing directory is fine.
checkpoint_path = str(Path.cwd() / 'checkpoint')
output_path = str(Path.cwd() / 'test.csv')
for _stale_path in (checkpoint_path, output_path):
    shutil.rmtree(_stale_path, ignore_errors=True)

# --- Schema of the incoming JSON records -----------------------------------
file_schema = (
    StructType()
    .add('id', StringType())
    .add('temperature', DoubleType())
    .add('timestamp', TimestampType())
)

# --- Catalog identifiers (only referenced by the disabled catalog-table code)
schema_name = 'dp700_e011'
table_name = 'temperature_stream'


In [16]:

# Local Spark session on all available cores (`local[*]`); `getOrCreate`
# returns the already-running session if this cell is executed again.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)


In [17]:
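# Disabled: the stream in this notebook writes to the in-memory sink, not to a
# catalog table, so the schema is not needed here.
# NOTE(review): presumably only required when writing to
# `{schema_name}.{table_name}` — confirm before re-enabling.
# spark.sql(f'CREATE SCHEMA IF NOT EXISTS {schema_name}')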


In [18]:
# Source: read the JSON files in `output_folder` as a stream using the
# explicit `file_schema`. `maxFilesPerTrigger=1` caps each micro-batch at one
# new file, so files are ingested one at a time.
raw_stream_df = (
    spark.readStream
    .schema(file_schema)
    .option('maxFilesPerTrigger', 1)
    .json(output_folder)
)

# Transform: stamp every row with the time it was processed.
transformed_stream_df = raw_stream_df.withColumn(
    'processed_timestamp', F.current_timestamp()
)

# Sink: the `memory` format keeps the results in an in-memory table named
# after the query ("filtered"), which can then be queried with Spark SQL.
# The memory sink does not write to a path, so none is passed to `start()`
# (passing one only suggested, wrongly, that output lands in a file).
# NOTE(review): the variable is still called `delta_stream` although the sink
# is in-memory — presumably a leftover from a Delta-table version; the name is
# kept because later cells reference it.
delta_stream = (
    transformed_stream_df.writeStream
    .format('memory')
    .queryName("filtered")
    .outputMode('append')
    .option('checkpointLocation', checkpoint_path)
    .start()
)

25/08/21 18:06:02 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [19]:
# Block until all input currently available in the source has been processed
# and committed to the sink, so the query in the next cell sees every row.
# NOTE(review): the PySpark docs describe this method as intended for
# testing — confirm before reusing it outside a demo/test notebook.
delta_stream.processAllAvailable()


# Alternative kept for reference: poll the query's status and last progress
# every 5 seconds for as long as it is active.
# while delta_stream.isActive:
#     print(delta_stream.status)
#     print(delta_stream.lastProgress)
#     time.sleep(5)
    

In [20]:
# The streaming query writes to the in-memory table "filtered" (its query
# name), so that table is read directly. The catalog-table variant is kept
# below for reference:
# df = spark.sql(f'SELECT * FROM {schema_name}.{table_name}')
df = spark.sql('SELECT * FROM filtered')  # plain string: nothing to interpolate
df.show(truncate=False)

+------------------------------------+-----------+--------------------------+-----------------------+
|id                                  |temperature|timestamp                 |processed_timestamp    |
+------------------------------------+-----------+--------------------------+-----------------------+
|69f36f78-8be6-4411-8676-224ce615d58e|18.97      |2025-08-21 18:05:11.485252|2025-08-21 18:06:02.39 |
|f7fefba8-23a3-4576-8ef1-d929435aae4e|21.57      |2025-08-21 18:05:16.487076|2025-08-21 18:06:02.495|
|bc63ba8c-9b62-4ceb-a6c7-ae9428a16634|25.47      |2025-08-21 18:05:21.499135|2025-08-21 18:06:02.585|
|1a5ea9b5-c8b0-40d1-a284-574565065fb0|25.47      |2025-08-21 18:05:26.50459 |2025-08-21 18:06:02.671|
|860ec31e-8443-48c7-b739-c1e67a110541|22.39      |2025-08-21 18:05:31.509125|2025-08-21 18:06:02.755|
|7aa49303-59e2-4142-a3c8-9c2fdefa9dac|20.97      |2025-08-21 18:05:36.513156|2025-08-21 18:06:02.866|
|bc602119-4d5e-4cce-a477-e09a5ec3aedc|19.87      |2025-08-21 18:05:41.51665 |2025-

In [21]:
# Tear down in order: stop the streaming query first, then the Spark session
# it runs on.
delta_stream.stop()
spark.stop()