In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window
from delta.tables import DeltaTable

# Create SparkSession
spark = SparkSession.builder.appName("DeltaSession") \
            .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0") \
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\
            .getOrCreate()

In [5]:
streamingDf = spark.readStream.format("rate").load()

In [6]:
streamingDf.isStreaming

True

In [7]:
streamingDf.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



## Write

In [8]:
stream = (streamingDf.selectExpr("value as id")
                      .writeStream
                      .format("delta")
                      .option("checkpointLocation", "extract/11streaming")
                      .start("extract/11streaming")
         )

In [None]:
stream.isActive

In [None]:
stream.stop()

In [None]:
stream.isActive

## Read

In [9]:
(streamingDf.writeStream
     .format("console")
     .outputMode("append")
     .start()
     .awaitTermination() 
)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/p

KeyboardInterrupt: 

In [None]:
(spark.readStream
     .format("delta")
     .load("extract/streaming")
     .writeStream
     .outputMode("append")
     .format("console")
     .start()
)

In [None]:
stream2 = (spark.readStream
                 .format("delta")
                 .load("extract/streaming")
                 .writeStream
                 .format("console")
                 .start()
            )

stream2.awaitTermination()

In [None]:
stream2.show()

In [None]:
https://github.com/delta-io/delta/blob/master/examples/python/streaming.py

# Spark Streaming

In [24]:
stream_df.isStreaming

False

In [25]:
stream_df.printSchema()

root
 |-- CPF: string (nullable = true)
 |-- PhoneNumber: string (nullable = true)
 |-- __op: string (nullable = true)
 |-- __table: string (nullable = true)
 |-- __source_ts_ms: integer (nullable = true)
 |-- __deleted: string (nullable = true)



In [26]:
#stream_df.limit(3).show()

In [27]:
(stream_df.writeStream
        .format('parquet')
        .option("checkpointLocation", checkpoint)
        .option('path', path)
        .start()
)

AttributeError: 'DataFrameWriter' object has no attribute 'start'

In [28]:
(stream_df.write
        .format('parquet')
        .option("checkpointLocation", checkpoint)
        .option('path', path)
)

<pyspark.sql.readwriter.DataFrameWriter at 0x7ff1dccd2d30>

In [20]:
(stream_df.writeStream
    .outputMode('append')
    .option("truncate", False)
    .option("numRows","3")
    .option("partition.assignment.strategy", "RangeAssignor")
    .format('console')
    .start()
    .awaitTermination(timeout=300)
)

StreamingQueryException: 'Missing required configuration "partition.assignment.strategy" which has no default value.\n=== Streaming Query ===\nIdentifier: [id = 858a75e1-4054-411a-a644-61fc5524feb5, runId = 26d912db-3559-445f-b58e-b95f346003ea]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nProject [data#125.CPF AS CPF#127, data#125.PhoneNumber AS PhoneNumber#128, data#125.__op AS __op#129, data#125.__table AS __table#130, data#125.__source_ts_ms AS __source_ts_ms#131, data#125.__deleted AS __deleted#132]\n+- Project [jsontostructs(StructField(CPF,StringType,true), StructField(PhoneNumber,StringType,true), StructField(__op,StringType,true), StructField(__table,StringType,true), StructField(__source_ts_ms,IntegerType,true), StructField(__deleted,StringType,true), cast(value#112 as string), Some(Etc/UTC)) AS data#125]\n   +- StreamingExecutionRelation KafkaV2[Subscribe[gact4.dbo.driver]], [key#111, value#112, topic#113, partition#114, offset#115L, timestamp#116, timestampType#117]\n'

In [8]:
# Upsert (merge) new data
newData = spark.range(0, 20)

deltaTable.alias("oldData") \
  .merge(
    newData.alias("newData"),
    "oldData.id = newData.id") \
  .whenMatchedUpdate(set = { "id": F.col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": F.col("newData.id") }) \
  .execute()

In [14]:
# Incremental Loads
if DeltaTable.isDeltaTable(spark, path):
    query = stream_df.writeStream \
        .format("delta") \
        .foreachBatch(upsertToDelta) \
        .outputMode("update") \
        .start(path) 
    
# First load
else:
    query = stream_df.writeStream
        .format("delta") \
        .outputMode("append") \
        .option("checkpointLocation", "checkpoint") \
        .start()

    print(write_stream_table_col.lastProgress)
    print(write_stream_table_col.status)
    write_stream_table_col.awaitTermination()

In [10]:
path = 'abfss://raizen@raizenadls.dfs.core.windows.net/SANDBOX/CS371880/DELTA/'

spark.read.format("delta").load(path).orderBy('id').toPandas()

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
