<a href="https://colab.research.google.com/github/carsofferrei/04_data_processing/blob/main/04_data_processing%20/spark_streaming/1_read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [1]:
# Install PySpark into the environment used by this notebook's kernel.
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Get (or create) a SparkSession running against the 'local' master.
spark = (
  SparkSession.builder
  .appName('Test streaming')
  .master('local')
  .getOrCreate()
)

# readStream with format "rate"
- readStream
- format("rate")

In [3]:
import pyspark.sql.functions as F

# Build a streaming DataFrame from the "rate" source with its default options.
stream = (
  spark.readStream
  .format("rate")
  .load()
)

In [4]:
# Show the Python class of the object returned by readStream...load().
type(stream)

In [5]:
# Check whether `stream` is a streaming DataFrame; it was read via readStream,
# so isStreaming is expected to be True.
stream.isStreaming

True

In [6]:
# A DataFrame created from in-memory rows is not a streaming one,
# so isStreaming should be False here.
columns = ["col1", "col2"]
data = [
  ("c1", "v1"),
  ("c2", "v2"),
]
df = spark.createDataFrame(data, schema=columns)
df.isStreaming

False

In [7]:
# Regular DataFrame operations such as printSchema() also work on a
# streaming DataFrame.
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [8]:
# Actions such as count() (the same applies to show()) cannot run directly on a
# streaming DataFrame: queries with streaming sources must be executed with
# writeStream.start().
# The failure is what this cell demonstrates, so the exception is caught and
# printed instead of being left to raise; that way "Restart & Run All" does not
# stop at this cell.
from pyspark.sql.utils import AnalysisException

try:
  stream.count()
except AnalysisException as err:
  print(f"AnalysisException: {err}")

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
rate

# Transform streaming dataframe

In [10]:
# Add a derived column "value2" holding twice the "value" column of the stream.
transformed = stream.withColumn("value2", F.col("value") * 2)

In [11]:
# The result of transforming a streaming DataFrame is still checked with isStreaming.
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [12]:
# Start a streaming query that writes to the in-memory sink; its results are
# read back further down through the table named 'rate_report'.
# 'append' output mode: each new row coming from the producer (our "rate"
# source) is added to the result.
# If there were an aggregation, 'complete' would be the better choice.
query = (
  transformed.writeStream
  .queryName('rate_report')
  .outputMode('append')
  .format('memory')
  .start()
)

# Checking result table

In [13]:
# The object returned by start() is a StreamingQuery.
type(query)

In [49]:
# Read the in-memory result table of the 'rate_report' query.
# The source produces one row per second, so the count grows each time this
# cell is re-run while the query is active.
# `F` is the pyspark.sql.functions alias imported in the readStream cell above.
print(spark.table("rate_report").count())
spark.table("rate_report").sort(F.col("timestamp").desc()).show(100)


948
+--------------------+-----+------+
|           timestamp|value|value2|
+--------------------+-----+------+
|2024-11-23 14:21:...|  947|  1894|
|2024-11-23 14:21:...|  946|  1892|
|2024-11-23 14:21:...|  945|  1890|
|2024-11-23 14:21:...|  944|  1888|
|2024-11-23 14:21:...|  943|  1886|
|2024-11-23 14:21:...|  942|  1884|
|2024-11-23 14:21:...|  941|  1882|
|2024-11-23 14:21:...|  940|  1880|
|2024-11-23 14:21:...|  939|  1878|
|2024-11-23 14:21:...|  938|  1876|
|2024-11-23 14:21:...|  937|  1874|
|2024-11-23 14:21:...|  936|  1872|
|2024-11-23 14:21:...|  935|  1870|
|2024-11-23 14:21:...|  934|  1868|
|2024-11-23 14:21:...|  933|  1866|
|2024-11-23 14:21:...|  932|  1864|
|2024-11-23 14:21:...|  931|  1862|
|2024-11-23 14:21:...|  930|  1860|
|2024-11-23 14:20:...|  929|  1858|
|2024-11-23 14:20:...|  928|  1856|
|2024-11-23 14:20:...|  927|  1854|
|2024-11-23 14:20:...|  926|  1852|
|2024-11-23 14:20:...|  925|  1850|
|2024-11-23 14:20:...|  924|  1848|
|2024-11-23 14:20:...|  

In [36]:
query.status
# The message can be either 'Processing new data' or 'Waiting for data to arrive'.
# When the query fails, a message about it usually shows up when this command runs.

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [37]:
# Check whether the streaming query is still active.
query.isActive

True

In [38]:
# List of recent progress reports, one dict per batch (batchId, numInputRows,
# durations, source and sink details).
query.recentProgress

[{'id': '67e7f66c-c29d-491d-99c2-11dcd1e575d7',
  'runId': 'feba535d-ce69-451d-a919-1530c790af11',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:08:38.748Z',
  'batchId': 187,
  'numInputRows': 1,
  'inputRowsPerSecond': 100.0,
  'processedRowsPerSecond': 9.803921568627452,
  'durationMs': {'addBatch': 39,
   'commitOffsets': 29,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 6,
   'triggerExecution': 102,
   'walCommit': 28},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': 187,
    'endOffset': 188,
    'latestOffset': 188,
    'numInputRows': 1,
    'inputRowsPerSecond': 100.0,
    'processedRowsPerSecond': 9.803921568627452}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 1}},
 {'id': '67e7f66c-c29d-491d-99c2-11dcd1e575d7',
  'runId': 'feba535d-ce69-451d-a919-1530c790af11',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:08:39.748Z',
  'batchId

In [39]:
# batchId taken from the latest progress report of the query.
query.lastProgress['batchId']

294

# Stop streaming

In [50]:
# Stop the 'rate_report' streaming query.
query.stop()

In [51]:
# awaitTermination


# Increase rows per second (rate)


In [52]:

# Same pipeline as before, but the "rate" source is configured with
# rowsPerSecond=20 and the results go to a separate in-memory table.
stream = (
  spark.readStream
  .format("rate")
  .option("rowsPerSecond", 20)
  .load()
)

# Derived column: value2 = value * 2.
transformed = stream.withColumn("value2", F.col("value") * 2)

query = (
  transformed.writeStream
  .queryName('rate_report_2')
  .outputMode('append')
  .format('memory')
  .start()
)


In [53]:
# Print the current row count of the 'rate_report_2' result table, then
# display up to 100 rows without truncating the column values.
print(spark.table("rate_report_2").count())
spark.table("rate_report_2").show(n=100, truncate=False)

260
+----------------------+-----+------+
|timestamp             |value|value2|
+----------------------+-----+------+
|2024-11-23 14:21:32.11|0    |0     |
|2024-11-23 14:21:32.16|1    |2     |
|2024-11-23 14:21:32.21|2    |4     |
|2024-11-23 14:21:32.26|3    |6     |
|2024-11-23 14:21:32.31|4    |8     |
|2024-11-23 14:21:32.36|5    |10    |
|2024-11-23 14:21:32.41|6    |12    |
|2024-11-23 14:21:32.46|7    |14    |
|2024-11-23 14:21:32.51|8    |16    |
|2024-11-23 14:21:32.56|9    |18    |
|2024-11-23 14:21:32.61|10   |20    |
|2024-11-23 14:21:32.66|11   |22    |
|2024-11-23 14:21:32.71|12   |24    |
|2024-11-23 14:21:32.76|13   |26    |
|2024-11-23 14:21:32.81|14   |28    |
|2024-11-23 14:21:32.86|15   |30    |
|2024-11-23 14:21:32.91|16   |32    |
|2024-11-23 14:21:32.96|17   |34    |
|2024-11-23 14:21:33.01|18   |36    |
|2024-11-23 14:21:33.06|19   |38    |
|2024-11-23 14:21:33.11|20   |40    |
|2024-11-23 14:21:33.16|21   |42    |
|2024-11-23 14:21:33.21|22   |44    |
|2024-11

In [54]:
# Number of input rows reported for the first source in the latest progress report.
query.lastProgress['sources'][0]['numInputRows']

20

In [55]:
# Print a short summary of every recent progress report, one "label - value"
# line per field, with "--" separating the batches.
for progress in query.recentProgress:
  for field in ("timestamp", "batchId", "numInputRows"):
    print(f"{field} - {progress[field]}")
  print("--")

timestamp - 2024-11-23T14:30:33.113Z
batchId - 541
numInputRows - 20
--
timestamp - 2024-11-23T14:30:34.119Z
batchId - 542
numInputRows - 20
--
timestamp - 2024-11-23T14:30:35.114Z
batchId - 543
numInputRows - 20
--
timestamp - 2024-11-23T14:30:36.120Z
batchId - 544
numInputRows - 20
--
timestamp - 2024-11-23T14:30:37.114Z
batchId - 545
numInputRows - 20
--
timestamp - 2024-11-23T14:30:38.116Z
batchId - 546
numInputRows - 20
--
timestamp - 2024-11-23T14:30:39.119Z
batchId - 547
numInputRows - 20
--
timestamp - 2024-11-23T14:30:40.110Z
batchId - 548
numInputRows - 20
--
timestamp - 2024-11-23T14:30:41.111Z
batchId - 549
numInputRows - 20
--
timestamp - 2024-11-23T14:30:42.120Z
batchId - 550
numInputRows - 20
--
timestamp - 2024-11-23T14:30:43.114Z
batchId - 551
numInputRows - 20
--
timestamp - 2024-11-23T14:30:44.110Z
batchId - 552
numInputRows - 20
--
timestamp - 2024-11-23T14:30:45.110Z
batchId - 553
numInputRows - 20
--
timestamp - 2024-11-23T14:30:46.119Z
batchId - 554
numInputRows 

In [56]:
# Stop the 'rate_report_2' streaming query.
query.stop()