In [4]:
import pyspark
from delta import *

# Session builder with the Delta Lake SQL extension and the Delta catalog configured.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# `configure_spark_with_delta_pip` is presumably provided by `from delta import *` above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
_path = "/Users/alexandrvolok/repos/spalah/tmp/range2"

# Write a small sample range as a Delta table at `_path`, using overwrite mode.
(
    spark.range(0, 2)
    .write
    .format("delta")
    .mode("overwrite")
    .save(_path)
)


                                                                                

In [78]:
# Sample Delta table properties: log and deleted-file retention intervals.
properties_to_set = {
    "delta.logRetentionDuration": "interval 10 days",
    "delta.deletedFileRetentionDuration": "interval 16 days",
}


In [28]:
# Handle to the Delta table stored at `_path`.
_dt_ref = DeltaTable.forPath(spark, _path)

In [29]:
# Query the table through SQL using the delta.`<path>` identifier; the bare
# expression makes the notebook display the resulting DataFrame.
spark.sql("select * from delta.`/Users/alexandrvolok/repos/spalah/tmp/range2`")

DataFrame[id: bigint]

In [17]:
from delta import *

def set_table_properties(
    properties: dict,
    table_path: str = '',
    table_name: str = '',
    allow_unset: bool = False,
    spark_session: "Union[SparkSession, None]" = None,
) -> None:
    """Sets and, optionally, unsets properties of a Delta table.

    A property that is already set on the table with the requested value is
    skipped, so ALTER TABLE is not triggered again for it.

    Args:
        properties (dict):              A dictionary with properties to set.
                                        For instance: {"delta.logRetentionDuration": "interval 10 days"}
        table_path (str, optional):     Path to the delta table. Defaults to ''.
        table_name (str, optional):     Delta table name. For instance: db1.table1. Defaults to ''.
        allow_unset (bool, optional):   When enabled, properties which are not defined in 'properties'
                                        but set on table will be unset. Defaults to False.
        spark_session (SparkSession, optional):
                                        The spark session to use. If not defined,
                                        SparkSession.getActiveSession() will be used.

    Raises:
        ValueError: if values for both 'table_path' and 'table_name' provided
                    provide values to one of them
        ValueError: if values for neither 'table_path' nor 'table_name' provided
                    provide values to one of them
    """
    if table_path and table_name:
        raise ValueError("Both 'table_path' and 'table_name' provided. Use one of them.")

    if not table_path and not table_name:
        raise ValueError("Neither 'table_path' nor 'table_name' defined. Use one of them.")

    if spark_session is None:
        # Imported lazily so the argument validation above has no Spark dependency.
        from pyspark.sql import SparkSession

        spark_session = SparkSession.getActiveSession()

    if table_path:
        # Use the caller's path (not a notebook global) both for the SQL
        # identifier and for resolving the table.
        table_name = f"delta.`{table_path}`"
        _delta_table = DeltaTable.forPath(path=table_path, sparkSession=spark_session)
    else:
        _delta_table = DeltaTable.forName(tableOrViewName=table_name, sparkSession=spark_session)

    if not _delta_table:
        # NOTE(review): forPath/forName may raise rather than return a falsy
        # value for a non-Delta location - confirm this branch is reachable.
        print(f"{table_name} is not a Delta Table")
        return

    # Snapshot of the properties currently set on the table, taken once up front.
    _existing_properties = _delta_table.detail().collect()[0].asDict()["properties"]

    print(f"Applying table properties on '{table_name}':")

    for k, v in properties.items():

        print(f" - Checking if '{k} = {v}' is set on {table_name}")

        if k in _existing_properties and _existing_properties[k] == v:
            print("   Result: The property already exists on the table")
        else:
            _sql = f"ALTER TABLE {table_name} SET TBLPROPERTIES ({k} = '{v}')"
            spark_session.sql(_sql)
            print("   Result: The property has been set")

    if allow_unset:
        # Only properties that were on the table before this call and are
        # absent from the requested dict are removed.
        for k, v in _existing_properties.items():
            if k not in properties:
                _sql = f"ALTER TABLE {table_name} UNSET TBLPROPERTIES ({k})"
                spark_session.sql(_sql)
                print(
                    f"   The property '{k} = {v}' has been unset because it is not defined in "
                    "the original dict"
                )


In [None]:
# Sample Delta table properties: log and deleted-file retention intervals.
properties_to_set = {
    "delta.logRetentionDuration": "interval 10 days",
    "delta.deletedFileRetentionDuration": "interval 17 days",
}



In [16]:
# Apply the two retention properties by table path; with allow_unset=True any
# property set on the table but not listed here is unset.
set_table_properties(
    properties={
        "delta.logRetentionDuration": "interval 10 days",
        "delta.deletedFileRetentionDuration": "interval 15 days",
    },
    table_path='/Users/alexandrvolok/repos/spalah/tmp/range2',
    allow_unset=True,
)

Applying table properties on 'delta.`/Users/alexandrvolok/repos/spalah/tmp/range2`':
 - Checking if 'delta.logRetentionDuration = interval 10 days' is set on delta.`/Users/alexandrvolok/repos/spalah/tmp/range2`
   Result: The property already exists on the table
 - Checking if 'delta.deletedFileRetentionDuration = interval 15 days' is set on delta.`/Users/alexandrvolok/repos/spalah/tmp/range2`
   Result: The property has been set


In [18]:
from typing import Union
from pyspark.sql import SparkSession

def get_table_properties(
    table_path: str = '',
    table_name: str = '',
    spark_session: Union["SparkSession", None] = None
) -> dict:
    """Gets the properties currently set on a Delta table.

    Args:
        table_path (str, optional):     Path to the delta table. Defaults to ''.
        table_name (str, optional):     Delta table name. For instance: db1.table1. Defaults to ''.
        spark_session: (SparkSession, optional)  The current spark context.
                                                 If not defined the getActiveSession() will be used

    Returns:
        dict: The "properties" entry of the table detail; an empty dict when
              no Delta table object was obtained.

    Raises:
        ValueError: if values for both 'table_path' and 'table_name' provided
                    provide values to one of them
        ValueError: if values for neither 'table_path' nor 'table_name' provided
                    provide values to one of them
    """
    if table_path and table_name:
        raise ValueError("Both 'table_path' and 'table_name' provided. Use one of them.")

    if not table_path and not table_name:
        raise ValueError("Neither 'table_path' nor 'table_name' defined. Use one of them.")

    if not spark_session:
        spark_session = SparkSession.getActiveSession()

    if table_path:
        # The SQL-style identifier is only used for the message below.
        table_name = f"delta.`{table_path}`"
        _delta_table = DeltaTable.forPath(path=table_path, sparkSession=spark_session)
    else:
        _delta_table = DeltaTable.forName(tableOrViewName=table_name, sparkSession=spark_session)

    if not _delta_table:
        # NOTE(review): forPath/forName may raise rather than return a falsy
        # value for a non-Delta location - confirm this branch is reachable.
        print(f"{table_name} is not a Delta Table")
        return {}

    return _delta_table.detail().collect()[0].asDict()["properties"]

In [19]:
# Read the table properties back by path; the bare expression displays the returned dict.
get_table_properties(table_path="/Users/alexandrvolok/repos/spalah/tmp/range2")

{'delta.deletedFileRetentionDuration': 'interval 15 days',
 'delta.logRetentionDuration': 'interval 10 days'}

In [5]:
from spalah.datalake import get_table_properties as g2

In [6]:
# Call the packaged implementation (imported as g2) with a path that is not a
# Delta table; the recorded output shows it reports this with a message.
g2(table_path="abc")

delta.`abc` is not a Delta Table


22/09/30 21:27:25 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 121536 ms exceeds timeout 120000 ms
22/09/30 21:27:25 WARN SparkContext: Killing executors is not supported by current scheduler.


In [3]:
from delta import *

In [6]:
# NOTE: relies on a `spark` session existing in the kernel; the recorded output
# shows a NameError because `spark` was not defined when this cell ran.
DeltaTable.isDeltaTable(sparkSession=spark, identifier="abc")

NameError: name 'spark' is not defined