In [22]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("s8a-streaming").getOrCreate()

In [23]:
# Le indicamos el esquema pq puede ser que la carpeta esté vacía y no pueda inferirlo

In [24]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
esquema = StructType([
    StructField("action", StringType(), False),
    StructField("id", StringType(), False),
    StructField("ts", StringType(), True)
])

In [25]:
df = spark.read.schema(esquema).json("ejercicios/mobile")

In [26]:
df.printSchema()

root
 |-- action: string (nullable = true)
 |-- id: string (nullable = true)
 |-- ts: string (nullable = true)



In [29]:
df.isStreaming

False

In [30]:
# source
df = spark.readStream.schema(esquema).json("ejercicios/mobile")

In [31]:
df.isStreaming

True

In [33]:
# perform a group by using event time of column ts and fixed window of 10 mins
from pyspark.sql.functions import window 
actionCountDF = df.groupBy(window("ts","10 minutes"), "action").count()

In [36]:
# sink
# the start() function triggers the Spark Structured Streaming engine to start 
# watching the mobile folder and start processing the data once it sees new files in that folder
mobileConsoleSink = actionCountDF.writeStream.format("console").option("truncate", "false").outputMode("complete").start()

22/04/09 20:16:48 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/p3/9_2bsgf50ps_4hf29pt9bbc80000gn/T/temporary-2e51f7b4-0f8e-4c1a-bc83-a85b6ea4b27a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/04/09 20:16:48 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+------+-----+
|window                                    |action|count|
+------------------------------------------+------+-----+
|{2018-03-02 10:00:00, 2018-03-02 10:10:00}|close |1    |
|{2018-03-02 10:00:00, 2018-03-02 10:10:00}|open  |3    |
+------------------------------------------+------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+------+-----+
|window                                    |action|count|
+------------------------------------------+------+-----+
|{2018-03-02 10:00:00, 2018-03-02 10:10:00}|close |2    |
|{2018-03-02 10:00:00, 2018-03-02 10:10:00}|open  |4    |
+------------------------------------------+------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+------+-----+
|window                                    |action|count|
+------------------------------------------+------+-----+
|{2018-03-02 10:10:00, 2018-03-02 10:20:00}|open  |1    |
|{2018-03-02 10:00:00, 2018-03-02 10:10:00}|close |3    |
|{2018-03-02 10:00:00, 2018-03-02 10:10:00}|open  |4    |
+------------------------------------------+------+-----+



In [37]:
mobileConsoleSink.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [38]:
mobileConsoleSink.lastProgress

{'id': 'a3a47e95-5330-4e83-b9e3-2b5aaff96de5',
 'runId': 'be7b871e-a1dd-4390-ad00-99bc6b2eed30',
 'name': None,
 'timestamp': '2022-04-09T18:19:25.495Z',
 'batchId': 3,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 2, 'triggerExecution': 2},
 'stateOperators': [{'operatorName': 'stateStoreSave',
   'numRowsTotal': 3,
   'numRowsUpdated': 0,
   'allUpdatesTimeMs': 606,
   'numRowsRemoved': 0,
   'allRemovalsTimeMs': 0,
   'commitTimeMs': 35787,
   'memoryUsedBytes': 87184,
   'numRowsDroppedByWatermark': 0,
   'numShufflePartitions': 200,
   'numStateStoreInstances': 200,
   'customMetrics': {'loadedMapCacheHitCount': 800,
    'loadedMapCacheMissCount': 0,
    'stateOnCurrentVersionSizeBytes': 21448}}],
 'sources': [{'description': "FileStreamSource[file:/Users/aitormedrano/OneDrive - Conselleria d'Educació/2122/bigdata/bigdata2122/docs/recursos/spark/ejercicios/mobile]",
   'startOffset': {'logOffset': 2},
   'endOffset'

En producción y aplicaciones de ejecución ininterrumpida, hay que llamar a 
StreamingQuery.awaitTermination()

In [39]:
mobileConsoleSink.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/Cellar/python@3.9/3.9.12/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [40]:
mobileConsoleSink.stop()

In [58]:
from pyspark.sql.functions import col, upper 

df = spark.readStream.schema(esquema).json("ejercicios/mobile")

cleanMobileSSDF = df.filter((col("action") == 'open') | (col("action") == 'close')).select("id", upper("action"), "ts")
cleanMobileSSDF.createOrReplaceTempView("clean_mobile")
spark.sql("select count(*) from clean_mobile")


mobileSQ =  cleanMobileSSDF.writeStream.outputMode("append").format("console").start()

22/04/09 20:39:07 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/p3/9_2bsgf50ps_4hf29pt9bbc80000gn/T/temporary-8cf68967-5e64-4c5b-906b-20ded1d96ca6. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/04/09 20:39:07 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------------+-------------------+
|    id|upper(action)|                 ts|
+------+-------------+-------------------+
|phone1|         OPEN|2018-03-02T10:02:33|
|phone2|         OPEN|2018-03-02T10:03:35|
|phone3|         OPEN|2018-03-02T10:03:50|
|phone1|        CLOSE|2018-03-02T10:04:35|
|phone3|        CLOSE|2018-03-02T10:07:35|
|phone4|         OPEN|2018-03-02T10:07:50|
+------+-------------+-------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+------+------+-------------------+
|action|    id|                 ts|
+------+------+-------------------+
| close|phone2|2018-03-02T10:04:50|
|  open|phone5|2018-03-02T10:10:50|
+------+------+-------------------+

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-------------+-------------------+
|    id|upper(acti

In [63]:
mobileSQ.stop()