In [None]:
import warnings
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window
from delta.tables import DeltaTable

warnings.filterwarnings("ignore", category=FutureWarning)

# Build (or reuse) the SparkSession with Delta Lake enabled:
# the delta-core package, the Delta SQL extension and the Delta catalog.
spark = (
    SparkSession.builder
    .appName("DeltaSession")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

## SQL - Create DeltaTable

In [None]:
# Create a Delta table with four columns through SQL.
# IF NOT EXISTS makes the statement a no-op when the table is already there,
# so the cell can be re-run safely.
spark.sql("""
CREATE TABLE IF NOT EXISTS 02deltatable2 (
  id INT,
  name STRING,
  age INT,
  city STRING
) USING DELTA
""")

In [None]:
# List the registered tables, rendered as a pandas DataFrame for display.
spark.sql("SHOW TABLES").toPandas()

In [None]:
# Show the table properties of 02deltatable2.
spark.sql("SHOW TBLPROPERTIES 02deltatable2").toPandas()

In [None]:
# Read the table back; nothing in this notebook inserts into it,
# so it is expected to be empty on a fresh run.
spark.sql("SELECT * FROM 02deltatable2").toPandas()

## Python - Create DeltaTable

4 options:
- create
- createIfNotExists
- replace
- createOrReplace

## [create](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTable.create)

classmethod create(sparkSession: Optional[pyspark.sql.session.SparkSession] = None) → delta.tables.DeltaTableBuilder

Return DeltaTableBuilder object that can be used to specify the table name, location, columns, partitioning columns, table comment, and table properties to create a Delta table, error if the table exists (the same as SQL CREATE TABLE).

Parameters:
- sparkSession – SparkSession to use for creating the table

## [createIfNotExists](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTable.createIfNotExists)

classmethod createIfNotExists(sparkSession: Optional[pyspark.sql.session.SparkSession] = None) → delta.tables.DeltaTableBuilder

Return DeltaTableBuilder object that can be used to specify the table name, location, columns, partitioning columns, table comment, and table properties to create a Delta table, if it does not exist (the same as SQL CREATE TABLE IF NOT EXISTS).

Parameters:
- sparkSession – SparkSession to use for creating the table

## [replace](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTable.replace)

classmethod replace(sparkSession: Optional[pyspark.sql.session.SparkSession] = None) → delta.tables.DeltaTableBuilder

Return DeltaTableBuilder object that can be used to specify the table name, location, columns, partitioning columns, table comment, and table properties to replace a Delta table, error if the table doesn’t exist (the same as SQL REPLACE TABLE).

Parameters:
- sparkSession – SparkSession to use for creating the table

## [createOrReplace](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTable.createOrReplace)

classmethod createOrReplace(sparkSession: Optional[pyspark.sql.session.SparkSession] = None) → delta.tables.DeltaTableBuilder

Return DeltaTableBuilder object that can be used to specify the table name, location, columns, partitioning columns, table comment, and table properties to replace a Delta table, or create it if it does not exist (the same as SQL CREATE OR REPLACE TABLE).

See DeltaTableBuilder for a full description and examples of this operation.

Parameters:
- sparkSession – SparkSession to use for creating the table


## [tableName](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTableBuilder.tableName)

tableName(identifier: str) → delta.tables.DeltaTableBuilder

Specify the table name. Optionally qualified with a database name [database_name.] table_name.

Parameters:
- identifier (str) – the table name

## [location](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTableBuilder.location)

location(location: str) → delta.tables.DeltaTableBuilder

Specify the path to the directory where table data is stored, which could be a path on distributed storage.

Parameters:
- location (str) – the location where the table data is stored

## [comment](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTableBuilder.comment)

comment(comment: str) → delta.tables.DeltaTableBuilder

Comment to describe the table.

Parameters:
- comment (str) – the table comment

## [addColumn](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTableBuilder.addColumn)

addColumn(colName: str, dataType: Union[str, pyspark.sql.types.DataType], nullable: bool = True, generatedAlwaysAs: Optional[str] = None, comment: Optional[str] = None) → delta.tables.DeltaTableBuilder

Specify a column in the table

Parameters:
- colName (str) – the column name
- dataType (str or pyspark.sql.types.DataType) – the column data type
- nullable (bool) – whether column is nullable
- generatedAlwaysAs (str) – a SQL expression if the column is always generated as a function of other columns. See online documentation for details on Generated Columns.
- comment (str) – the column comment

## [partitionedBy](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTableBuilder.partitionedBy)

partitionedBy(*cols) → delta.tables.DeltaTableBuilder

Specify columns for partitioning

Parameters:
- cols (str or list of column names) – the partitioning columns

## [property](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTableBuilder.property)

property(key: str, value: str) → delta.tables.DeltaTableBuilder

Specify a table property

Parameters:
- key (str) – the table property key
- value (str) – the table property value

## [execute](https://docs.delta.io/latest/api/python/index.html#delta.tables.DeltaTableBuilder.execute)

execute() → delta.tables.DeltaTable

Execute Table Creation.

In [None]:
# Describe the table with the fluent DeltaTableBuilder first, then run the
# creation as a separate step. createOrReplace overwrites an existing
# definition, so the cell can be re-run.
table_builder = (
    DeltaTable.createOrReplace(spark)
    .tableName("02deltatable3")
    .addColumn("id", "INT")
    .addColumn("name", "STRING")
    .addColumn("age", "INT")
    .addColumn("city", "STRING", comment="cityWorld")
    .property("description", "table with people data")
    .property("abc", "123")
    .partitionedBy("id")
)
table_builder.execute()

In [None]:
# List the tables again; 02deltatable3 should now be registered.
spark.sql("SHOW TABLES").toPandas()

In [None]:
# Show the table properties; the custom `description` and `abc` keys set
# through the builder should appear here.
spark.sql("SHOW TBLPROPERTIES 02deltatable3").toPandas()

In [None]:
# Describe the table's columns (the `city` column was given a comment).
spark.sql("DESCRIBE 02deltatable3").toPandas()

In [None]:
# Read the freshly (re)created table, before any row has been inserted.
spark.sql("SELECT * FROM 02deltatable3").toPandas()

In [None]:
# Insert one row, naming the target columns explicitly.
spark.sql("""
INSERT INTO 02deltatable3 (id, name, age, city)
VALUES (1, 'Marcelo', 5, 'Sao Paulo')
""")

In [None]:
# Read the table back to see the inserted row.
spark.sql("SELECT * FROM 02deltatable3").toPandas()

## Create Table Like

In [None]:
# Create a new table from the definition of 02deltatable3.
# NOTE: there is no IF NOT EXISTS here, so re-running this cell is expected
# to fail once 02deltatable3like exists.
spark.sql("CREATE TABLE 02deltatable3like LIKE 02deltatable3")

In [None]:
# Describe the LIKE table to compare its columns with those of 02deltatable3.
spark.sql("DESCRIBE 02deltatable3like").toPandas()

In [None]:
# Query the LIKE table.
# NOTE(review): CREATE TABLE LIKE presumably copies only the definition, not
# the rows - confirm against the output.
spark.sql("SELECT * FROM 02deltatable3like").toPandas()

## Shallow Clone

In [None]:
# Create 02deltatableclone as a shallow clone of 02deltatable3.
spark.sql("CREATE TABLE 02deltatableclone SHALLOW CLONE 02deltatable3").toPandas()

In [None]:
# Contents of the source table, for comparison with the clone.
spark.sql("SELECT * FROM 02deltatable3").toPandas()

In [None]:
# Load the clone by name through the DeltaTable API and display its rows.
dt = DeltaTable.forName(spark, "02deltatableclone")
dt.toDF().toPandas()

In [None]:
# Show the history of the table currently bound to `dt` (the clone).
dt.history().toPandas()

In [None]:
# Insert a second row into the source table (not into the clone).
spark.sql("""
INSERT INTO 02deltatable3 (id, name, age, city)
VALUES (2, 'Velero', 15, 'Sao Paulo')
""")

In [None]:
# The source table after the second insert.
spark.sql("SELECT * FROM 02deltatable3").toPandas()

In [None]:
# Re-load the clone to check whether the row just inserted into the source
# table shows up in it.
dt = DeltaTable.forName(spark, "02deltatableclone")
dt.toDF().toPandas()

In [None]:
# Insert a row directly into the clone.
spark.sql("""
INSERT INTO 02deltatableclone (id, name, age, city)
VALUES (4, 'Ted', 54, 'Sao Paulo')
""")

In [None]:
# Re-load the clone to see the row that was written to it.
dt = DeltaTable.forName(spark, "02deltatableclone")
dt.toDF().toPandas()

In [None]:
# History of the clone after the write to it.
dt.history().toPandas()

## Drop

In [None]:
# List the tables before dropping one.
spark.sql("SHOW TABLES").toPandas()

In [None]:
# Drop the source table. The backticks quote the identifier, which starts
# with a digit; IF EXISTS keeps the cell re-runnable.
spark.sql("DROP TABLE IF EXISTS `02deltatable3`")

In [None]:
# List the tables again to confirm 02deltatable3 is gone.
spark.sql("SHOW TABLES").toPandas()

In [None]:
# Query the clone after its source table was dropped.
# NOTE(review): a shallow clone presumably still references the source table's
# data files, so this read may fail if the drop removed them - confirm.
spark.sql("SELECT * FROM 02deltatableclone").toPandas()

## Generated Columns

In [None]:
# Create `02deltatable4` with four regular columns plus four generated columns.
# `create` errors if the table already exists, so drop the table before
# re-running this cell.
(DeltaTable
    .create(spark)
    .tableName("02deltatable4")
    .addColumn("id", "INT")
    .addColumn("name", "STRING")
    .addColumn("age", "INT")
    .addColumn("city", "STRING", comment="cityWorld")
    # Constant generated column: every row gets the literal 'Brazil'.
    .addColumn("country", "STRING", generatedAlwaysAs="'Brazil'")
    .addColumn("age-5", "INT", generatedAlwaysAs="age - 5")
    # Reference the `name` and `city` columns, unquoted. Quoting them as
    # 'name' and 'city' would concatenate string literals and yield the
    # constant 'name-city' for every row.
    .addColumn("name-city", "STRING", generatedAlwaysAs="CONCAT(name, '-', city)")
    .addColumn("age_string", "STRING", generatedAlwaysAs="CAST(age as STRING)")
    .property("description", "table with people data")
    .execute()
)

In [None]:
# Load the new table by name and display it; no rows have been written to it
# at this point in the notebook.
DeltaTable.forName(spark, "02deltatable4").toDF().toPandas()

In [None]:
# Schema of the input rows: four nullable columns
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
)

# Sample rows; the third one has no name
data = [
    (1, "Alice", 25, "New York"),
    (2, "Bob", 30, "San Francisco"),
    (3, None, 35, "Chicago"),
]

# Build the DataFrame from the rows and the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

df.toPandas()

In [None]:
# Append the DataFrame to the table. `df` only carries id/name/age/city.
# NOTE: the generated columns are presumably computed by Delta on write -
# check the result of the next query.
df.write.mode("append").format("delta").saveAsTable("02deltatable4")

In [None]:
# Read the table back to inspect the appended rows, including the generated columns.
spark.sql("SELECT * FROM 02deltatable4").toPandas()

In [None]:
# Describe the table's columns, regular and generated.
spark.sql("DESCRIBE 02deltatable4").toPandas()