<a href="https://colab.research.google.com/github/carsofferrei/04_data_processing/blob/main/04_data_processing/spark_streaming/1_read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [1]:
# Install PySpark into the kernel's environment (needed on a fresh Colab runtime).
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession running on the local master,
# registered under the application name 'Test streaming'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .getOrCreate()
)

# readStream with format "rate"
- readStream
- format("rate")

In [3]:
import pyspark.sql.functions as F

# Open a streaming DataFrame backed by the built-in "rate" source.
stream = (
    spark.readStream
    .format("rate")
    .load()
)

In [4]:
# Display the Python class of the object returned by readStream...load().
type(stream)

In [5]:
# Check whether `stream` is a streaming DataFrame (the recorded output is True).
stream.isStreaming

True

In [6]:
# Contrast with a regular (static) DataFrame built from in-memory rows:
# its isStreaming flag should be False.
columns = ["col1", "col2"]
data = [("c1", "v1"), ("c2", "v2")]
df = spark.createDataFrame(data, schema=columns)
df.isStreaming

False

In [7]:
# Regular DataFrame operations such as printSchema() also work on a streaming
# DataFrame; the recorded output lists the columns `timestamp` and `value`.
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [8]:
# Actions such as stream.show() or stream.count() cannot be run directly on a
# streaming DataFrame: Spark raises AnalysisException
# ("Queries with streaming sources must be executed with writeStream.start()").
# The exception is the point of this cell, so catch it and print it instead of
# letting it abort "Restart & Run All".
from pyspark.sql.utils import AnalysisException

try:
    stream.count()
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
rate

# Transform streaming dataframe

In [10]:
# Derive a new column `value2` holding `value` multiplied by 2.
transformed = stream.withColumn("value2", F.col("value") * 2)

In [11]:
# The transformed DataFrame is still a streaming DataFrame (the recorded output is True).
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [12]:
# Start a streaming query that writes `transformed` to the in-memory table
# 'rate_report'.
# outputMode 'append': each new row produced by the source (our rate stream)
# is added to the result table. If the query contained an aggregation,
# 'complete' would be the better choice.
query = (
    transformed.writeStream
    .format('memory')
    .queryName('rate_report')
    .outputMode('append')
    .start()
)

# Checking result table

In [13]:
# start() returns a handle to the running query; display its class
# (expected: StreamingQuery).
type(query)

In [19]:
import pyspark.sql.functions as F

# Row count of the in-memory result table, followed by its 20 most recent
# rows (newest first, untruncated). With this stream the table gains
# one line per second.
print(spark.table("rate_report").count())
(
    spark.table("rate_report")
    .orderBy(F.desc("timestamp"))
    .show(n=20, truncate=False)
)

173
+----------------------+-----+------+
|timestamp             |value|value2|
+----------------------+-----+------+
|2024-11-23 14:08:22.74|172  |344   |
|2024-11-23 14:08:21.74|171  |342   |
|2024-11-23 14:08:20.74|170  |340   |
|2024-11-23 14:08:19.74|169  |338   |
|2024-11-23 14:08:18.74|168  |336   |
|2024-11-23 14:08:17.74|167  |334   |
|2024-11-23 14:08:16.74|166  |332   |
|2024-11-23 14:08:15.74|165  |330   |
|2024-11-23 14:08:14.74|164  |328   |
|2024-11-23 14:08:13.74|163  |326   |
|2024-11-23 14:08:12.74|162  |324   |
|2024-11-23 14:08:11.74|161  |322   |
|2024-11-23 14:08:10.74|160  |320   |
|2024-11-23 14:08:09.74|159  |318   |
|2024-11-23 14:08:08.74|158  |316   |
|2024-11-23 14:08:07.74|157  |314   |
|2024-11-23 14:08:06.74|156  |312   |
|2024-11-23 14:08:05.74|155  |310   |
|2024-11-23 14:08:04.74|154  |308   |
|2024-11-23 14:08:03.74|153  |306   |
+----------------------+-----+------+
only showing top 20 rows



In [36]:
# Current status of the streaming query.
query.status
# The message can be either 'Processing new data' or 'Waiting for data to arrive'

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [37]:
# Whether the streaming query is still running (the recorded output is True).
query.isActive

True

In [38]:
# List of recent progress reports, one dict per batch (batchId, numInputRows,
# durations, sources, sink), as shown in the recorded output.
query.recentProgress

[{'id': '67e7f66c-c29d-491d-99c2-11dcd1e575d7',
  'runId': 'feba535d-ce69-451d-a919-1530c790af11',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:08:38.748Z',
  'batchId': 187,
  'numInputRows': 1,
  'inputRowsPerSecond': 100.0,
  'processedRowsPerSecond': 9.803921568627452,
  'durationMs': {'addBatch': 39,
   'commitOffsets': 29,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 6,
   'triggerExecution': 102,
   'walCommit': 28},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': 187,
    'endOffset': 188,
    'latestOffset': 188,
    'numInputRows': 1,
    'inputRowsPerSecond': 100.0,
    'processedRowsPerSecond': 9.803921568627452}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 1}},
 {'id': '67e7f66c-c29d-491d-99c2-11dcd1e575d7',
  'runId': 'feba535d-ce69-451d-a919-1530c790af11',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:08:39.748Z',
  'batchId

In [39]:
# Batch id taken from the most recent progress report.
query.lastProgress['batchId']

294

# Stop streaming

In [None]:
# Stop the streaming query currently held in `query`.
query.stop()

In [None]:
# awaitTermination


# Increase rows per second (rate)


In [None]:

# Same pipeline as before, but the rate source is asked for 20 rows per second.
stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 20)
    .load()
)

# Derive `value2` as `value` multiplied by 2.
transformed = stream.withColumn("value2", F.col("value") * 2)

# Append every new row to the in-memory table 'rate_report_2'.
query = (
    transformed.writeStream
    .format('memory')
    .queryName('rate_report_2')
    .outputMode('append')
    .start()
)


In [None]:
# Row count of the second result table, then up to 100 of its rows, untruncated.
print(spark.table("rate_report_2").count())
(
    spark.table("rate_report_2")
    .show(n=100, truncate=False)
)

740
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2024-11-22 15:16:27.485|0    |0     |
|2024-11-22 15:16:27.535|1    |2     |
|2024-11-22 15:16:27.585|2    |4     |
|2024-11-22 15:16:27.635|3    |6     |
|2024-11-22 15:16:27.685|4    |8     |
|2024-11-22 15:16:27.735|5    |10    |
|2024-11-22 15:16:27.785|6    |12    |
|2024-11-22 15:16:27.835|7    |14    |
|2024-11-22 15:16:27.885|8    |16    |
|2024-11-22 15:16:27.935|9    |18    |
|2024-11-22 15:16:27.985|10   |20    |
|2024-11-22 15:16:28.035|11   |22    |
|2024-11-22 15:16:28.085|12   |24    |
|2024-11-22 15:16:28.135|13   |26    |
|2024-11-22 15:16:28.185|14   |28    |
|2024-11-22 15:16:28.235|15   |30    |
|2024-11-22 15:16:28.285|16   |32    |
|2024-11-22 15:16:28.335|17   |34    |
|2024-11-22 15:16:28.385|18   |36    |
|2024-11-22 15:16:28.435|19   |38    |
|2024-11-22 15:16:28.485|20   |40    |
|2024-11-22 15:16:28.535|21   |42    |
|2024-11-22 15:16:28.

In [None]:
# Number of input rows the first source reported for the latest batch
# (the recorded output is 20, matching the rowsPerSecond option set on the stream).
query.lastProgress['sources'][0]['numInputRows']

20

In [None]:
# Print a short summary (timestamp, batch id, input row count) for each
# recent progress report, separated by "--".
for progress in query.recentProgress:
    for field in ("timestamp", "batchId", "numInputRows"):
        print(f"{field} - {progress[field]}")
    print("--")

In [None]:
# Stop the streaming query currently held in `query`.
query.stop()