<a href="https://colab.research.google.com/github/anaferreira744/DE-DP-ADF/blob/main/1_read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [1]:
# Install PySpark into the running kernel's environment (e.g. when opened on Colab).
# NOTE(review): the version is not pinned, so a re-run may pick up a different release — consider pinning.
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a single-node local Spark session for the streaming examples.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .getOrCreate()
)

# readStream with format "rate"
- readStream
- format("rate")

In [3]:
import pyspark.sql.functions as F

# Open a streaming DataFrame on the built-in "rate" source.
stream = (
    spark.readStream
    .format("rate")
    .load()
)

In [4]:
# Inspect the Python class of the object returned by readStream...load().
type(stream)

In [5]:
# Check whether this is a streaming DataFrame: isStreaming is True when the
# DataFrame is backed by a streaming source such as the one created above.
stream.isStreaming

True

In [6]:
# Contrast: a DataFrame created from in-memory rows is a batch DataFrame,
# so isStreaming is expected to be False here.
columns = ["col1", "col2"]
data = [("c1", "v1"), ("c2", "v2")]
df = spark.createDataFrame(data, schema=columns)
df.isStreaming

False

In [7]:
# Schema inspection works on a streaming DataFrame exactly as on a batch one.
# The source used here exposes two columns: `timestamp` and `value`.
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [8]:
# Batch-style actions such as show() or count() are not allowed on a streaming
# DataFrame: Spark rejects them with an AnalysisException because queries with
# streaming sources must be executed with writeStream.start().
# The error is the point of this cell, so it is caught and displayed instead of
# being left to abort a "Restart & Run All".
from pyspark.sql.utils import AnalysisException

try:
    stream.count()
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
rate

# Transform streaming dataframe

In [9]:
# Column transformations are declared on a streaming DataFrame the same way as
# on a batch one; here `value2` is derived as twice `value`.
transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

In [10]:
# The result of transforming a streaming DataFrame is still a streaming DataFrame.
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [11]:
# Start the streaming query: rows from `transformed` are appended to an
# in-memory table registered under the query name "rate_report".
# start() returns the handle used below to monitor and stop the query.
query = transformed.writeStream.start(
    format='memory',
    outputMode='append',
    queryName='rate_report',
)

# Checking result table

In [12]:
# start() returned a handle to the running query (expected class: StreamingQuery).
type(query)

In [13]:
# Read the in-memory result table by the query's name.
# NOTE: count() and show() are two separate actions against a table that is
# still being appended to, so the printed count can be lower than the number
# of rows displayed right after it.
print(spark.table("rate_report").count())
spark.table("rate_report").show(20, False)

# one row is added per second (see the timestamps in the output)

10
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2024-11-23 14:01:32.311|0    |0     |
|2024-11-23 14:01:33.311|1    |2     |
|2024-11-23 14:01:34.311|2    |4     |
|2024-11-23 14:01:35.311|3    |6     |
|2024-11-23 14:01:36.311|4    |8     |
|2024-11-23 14:01:37.311|5    |10    |
|2024-11-23 14:01:38.311|6    |12    |
|2024-11-23 14:01:39.311|7    |14    |
|2024-11-23 14:01:40.311|8    |16    |
|2024-11-23 14:01:41.311|9    |18    |
|2024-11-23 14:01:42.311|10   |20    |
|2024-11-23 14:01:43.311|11   |22    |
|2024-11-23 14:01:44.311|12   |24    |
|2024-11-23 14:01:45.311|13   |26    |
+-----------------------+-----+------+



In [15]:
# Current state of the query: a status message plus flags telling whether
# data is available and whether a trigger is currently active.
query.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [16]:
# True here because the query has been started and not yet stopped.
query.isActive

True

In [17]:
# List of progress records for recent micro-batches; each record carries the
# query ids, batchId, row counts/rates, per-phase durations, and source/sink details.
# NOTE(review): the output is long — consider slicing (e.g. the last few records) when sharing.
query.recentProgress

[{'id': '55282fdd-6243-4c69-bc54-469305a32070',
  'runId': '6e2029b1-7729-41b6-afdd-c25a09a5d817',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:08:11.311Z',
  'batchId': 398,
  'numInputRows': 1,
  'inputRowsPerSecond': 100.0,
  'processedRowsPerSecond': 11.76470588235294,
  'durationMs': {'addBatch': 25,
   'commitOffsets': 27,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 5,
   'triggerExecution': 85,
   'walCommit': 28},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': 398,
    'endOffset': 399,
    'latestOffset': 399,
    'numInputRows': 1,
    'inputRowsPerSecond': 100.0,
    'processedRowsPerSecond': 11.76470588235294}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 1}},
 {'id': '55282fdd-6243-4c69-bc54-469305a32070',
  'runId': '6e2029b1-7729-41b6-afdd-c25a09a5d817',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:08:12.312Z',
  'batchId'

In [23]:
# batchId of the most recent progress record reported by the query.
query.lastProgress['batchId']

588

# Stop streaming

In [21]:
# Stop the streaming query so that no further micro-batches are processed.
query.stop()

In [28]:
# The in-memory result table can also be queried with SQL; here we count the
# rows it holds (presumably the rows collected up to the stop — confirm by re-running in order).
spark.sql("select * from rate_report").count()

589

In [None]:
# awaitTermination
# TODO: placeholder cell — no awaitTermination() example has been written here yet.


# Increase rows per second (rate)


In [24]:

# Same pipeline as before, but the rate source is configured with
# rowsPerSecond=20 instead of relying on its default.
stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 20)
    .load()
)

transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

# Results go to a second in-memory table, registered under its own query name.
query = transformed.writeStream.start(
    format='memory',
    outputMode='append',
    queryName='rate_report_2',
)


In [29]:
# Row count of the second result table, followed by up to 100 of its rows
# printed without column truncation (second argument False).
print(spark.table("rate_report_2").count())
spark.table("rate_report_2").show(100, False)

1480
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2024-11-23 14:13:23.422|0    |0     |
|2024-11-23 14:13:23.472|1    |2     |
|2024-11-23 14:13:23.522|2    |4     |
|2024-11-23 14:13:23.572|3    |6     |
|2024-11-23 14:13:23.622|4    |8     |
|2024-11-23 14:13:23.672|5    |10    |
|2024-11-23 14:13:23.722|6    |12    |
|2024-11-23 14:13:23.772|7    |14    |
|2024-11-23 14:13:23.822|8    |16    |
|2024-11-23 14:13:23.872|9    |18    |
|2024-11-23 14:13:23.922|10   |20    |
|2024-11-23 14:13:23.972|11   |22    |
|2024-11-23 14:13:24.022|12   |24    |
|2024-11-23 14:13:24.072|13   |26    |
|2024-11-23 14:13:24.122|14   |28    |
|2024-11-23 14:13:24.172|15   |30    |
|2024-11-23 14:13:24.222|16   |32    |
|2024-11-23 14:13:24.272|17   |34    |
|2024-11-23 14:13:24.322|18   |36    |
|2024-11-23 14:13:24.372|19   |38    |
|2024-11-23 14:13:24.422|20   |40    |
|2024-11-23 14:13:24.472|21   |42    |
|2024-11-23 14:13:24

In [33]:
# Rows read from the first (only) source in the latest micro-batch; with
# rowsPerSecond=20 this shows 20 in the output below.
query.lastProgress['sources'][0]['numInputRows']

20

In [37]:
# Summarise each recent micro-batch: when it ran, its id, and how many rows it read.
for batch in query.recentProgress:
    print(
        f"timestamp - {batch['timestamp']}",
        f"batchId - {batch['batchId']}",
        f"numInputRows - {batch['numInputRows']}",
        "--",
        sep="\n",
    )

timestamp - 2024-11-23T14:22:51.423Z
batchId - 568
numInputRows - 20
--
timestamp - 2024-11-23T14:22:52.428Z
batchId - 569
numInputRows - 20
--
timestamp - 2024-11-23T14:22:53.422Z
batchId - 570
numInputRows - 20
--
timestamp - 2024-11-23T14:22:54.429Z
batchId - 571
numInputRows - 20
--
timestamp - 2024-11-23T14:22:55.426Z
batchId - 572
numInputRows - 20
--
timestamp - 2024-11-23T14:22:56.424Z
batchId - 573
numInputRows - 20
--
timestamp - 2024-11-23T14:22:57.425Z
batchId - 574
numInputRows - 20
--
timestamp - 2024-11-23T14:22:58.430Z
batchId - 575
numInputRows - 20
--
timestamp - 2024-11-23T14:22:59.427Z
batchId - 576
numInputRows - 20
--
timestamp - 2024-11-23T14:23:00.430Z
batchId - 577
numInputRows - 20
--
timestamp - 2024-11-23T14:23:01.430Z
batchId - 578
numInputRows - 20
--
timestamp - 2024-11-23T14:23:02.426Z
batchId - 579
numInputRows - 20
--
timestamp - 2024-11-23T14:23:03.423Z
batchId - 580
numInputRows - 20
--
timestamp - 2024-11-23T14:23:04.427Z
batchId - 581
numInputRows 

In [38]:
# Stop the second streaming query (the one writing to "rate_report_2").
query.stop()