In [0]:
from typing import List, Dict, Any, Optional
from pyspark.sql import DataFrame

class DataSink:
    """
    Base class for data sinks.

    A sink bundles a DataFrame with the destination path, the write method
    (e.g., 'overwrite', 'append') and any extra parameters needed to write it.
    Subclasses must override `load_data_frame` to perform the actual write.
    """

    def __init__(self, path: str, method: str, params: Optional[Dict[str, Any]], df: DataFrame):
        """
        Stores the attributes shared by every sink.

        Parameters:
        path (str): The file path or destination where the data will be loaded.
        method (str): The method used to load the data, e.g., 'overwrite' or 'append'.
        params (dict or None): Additional parameters that may be required for data
                               loading. None is stored as an empty dict.
        df (DataFrame): The DataFrame to be loaded.
        """
        self.path = path
        self.method = method
        # Callers may pass None when there are no extra parameters; storing an
        # empty dict keeps `self.params.get(...)` safe in subclasses.
        # A dict that is passed in is stored as-is (not copied).
        self.params = params if params is not None else {}
        self.df = df

    def load_data_frame(self) -> None:
        """
        Placeholder that subclasses override with the actual write logic.

        Raises:
        ValueError: Always, when called on a class that has not overridden it.
        """
        raise ValueError("Method not defined")


In [0]:

class LoadToADLSGEN2WithPartition(DataSink):
    """
    A concrete implementation of the DataSink class that writes the DataFrame
    as Parquet to `self.path`, partitioned by the columns listed under the
    "partitionByColumns" key of `self.params`.
    """

    def load_data_frame(self) -> None:
        """
        Writes the DataFrame as partitioned Parquet.

        The partition columns are read from `self.params["partitionByColumns"]`.
        A single column name given as a plain string is accepted and treated
        as a one-element list.

        Parameters:
        None

        Returns:
        None

        Raises:
        ValueError: If `self.params` is None or does not provide a
                    "partitionByColumns" value.
        """
        # `self.params` can be None when the sink was built without extra parameters
        partition_columns = (self.params or {}).get("partitionByColumns")
        if partition_columns is None:
            raise ValueError(
                "params['partitionByColumns'] is required for a partitioned write"
            )

        # A bare string would otherwise be unpacked character by character below
        if isinstance(partition_columns, str):
            partition_columns = [partition_columns]
        column_names: List[str] = list(partition_columns)

        # Write the DataFrame with partitioning and save to the specified path
        self.df.write.format("parquet").mode(self.method).partitionBy(*column_names).save(self.path)


class LoadToADLSGEN2(DataSink):
    """
    DataSink implementation that saves the DataFrame as Parquet at `self.path`
    without partitioning (going by the class name, the destination is an
    Azure Data Lake Storage Gen2 location).
    """

    def load_data_frame(self) -> None:
        """
        Saves `self.df` in Parquet format at `self.path`.
        `self.method` is passed to the writer as the save mode.

        Parameters:
        None

        Returns:
        None
        """
        # Configure the writer step by step: format first, then the save mode
        parquet_writer = self.df.write.format("parquet")
        parquet_writer = parquet_writer.mode(self.method)

        # Perform the write; no partitioning is applied
        parquet_writer.save(self.path)


class LoadToADLSGEN2WithDelta(DataSink):
    """
    DataSink implementation that saves the DataFrame in Delta format at
    `self.path` (going by the class name, an Azure Data Lake Storage Gen2
    location). The writer is always given the "overwriteSchema" option.
    """

    def load_data_frame(self) -> None:
        """
        Saves `self.df` in Delta format at `self.path`.
        `self.method` is passed to the writer as the save mode, and the
        "overwriteSchema" option is set to True on every write.

        Parameters:
        None

        Returns:
        None
        """
        # Configure the writer step by step: format, save mode, then the schema option
        delta_writer = self.df.write.format("delta")
        delta_writer = delta_writer.mode(self.method)
        delta_writer = delta_writer.option("overwriteSchema", True)

        # Perform the write to the configured path
        delta_writer.save(self.path)


In [0]:


def get_data_sink(sink_type: str, df: DataFrame, path: str, method: str, params: Optional[dict] = None) -> DataSink:
    """
    Factory function that builds the DataSink matching the requested sink type.

    Parameters:
    sink_type (str): Which sink to create: 'adls' (plain Parquet),
                     'adls_with_partition' (partitioned Parquet) or 'delta'.
    df (DataFrame): The DataFrame to be loaded.
    path (str): The file path where the DataFrame will be saved.
    method (str): The write mode to use (e.g., 'overwrite', 'append').
    params (dict, optional): Additional parameters for the DataSink (e.g., partition columns).
                             Defaults to None and is forwarded to the sink unchanged.

    Returns:
    DataSink: An instance of the `DataSink` subclass selected by `sink_type`.

    Raises:
    ValueError: If an unsupported `sink_type` is provided.
    """
    # Supported sink names paired with the class that implements each one
    known_sinks = (
        ("adls", LoadToADLSGEN2),
        ("adls_with_partition", LoadToADLSGEN2WithPartition),
        ("delta", LoadToADLSGEN2WithDelta),
    )

    # Build the first sink whose name equals the requested type
    for sink_name, sink_class in known_sinks:
        if sink_type == sink_name:
            return sink_class(path, method, params, df)

    # Nothing matched: the sink type is unsupported
    raise ValueError(f"Not implemented for this {sink_type} type")
