In [0]:
class DataSource:
    """
    Base class for data sources from which a DataFrame can be retrieved.

    The class only stores the location of the data; subclasses are expected
    to override `get_data_frame` with the logic that actually loads it.
    Note that this is a plain class (not an `abc.ABC`), so it can still be
    instantiated directly; calling `get_data_frame` on it raises.
    """

    def __init__(self, path: str):
        """
        Stores the location of the data source.

        Parameters:
        path (str): The file path or location from which the data will be sourced.
        """
        self.path = path  # Read by subclasses when loading the data

    def get_data_frame(self) -> "DataFrame":
        """
        Placeholder that subclasses must override to load and return the data.

        The return annotation is a string forward reference because the
        DataFrame type is not imported at the point where this class is defined.

        Returns:
        DataFrame: In subclasses, the loaded data. This base implementation never returns.

        Raises:
        ValueError: Always, since the base class has no loading logic.
        """
        raise ValueError("Method not defined")

    

In [0]:
from pyspark.sql import DataFrame

class ParquetDataSource(DataSource):
    """
    A concrete implementation of the DataSource class for reading Parquet data.
    This class reads the Parquet data at the provided path and loads it as a Spark DataFrame.
    """

    def get_data_frame(self) -> DataFrame:
        """
        Reads the Parquet data from `self.path` and returns it as a Spark DataFrame.
        No reader options are set; only the 'parquet' format is selected.

        Relies on a `spark` session object being available in the enclosing
        (notebook/global) scope; it is not defined in this class.

        Returns:
        DataFrame: The loaded Spark DataFrame containing the data from the Parquet source.
        """
        # Select the Parquet reader explicitly and load from the stored path
        return (spark.read
                    .format("parquet")
                    .load(self.path))


In [0]:
class CSVDataSource(DataSource):
    """
    DataSource implementation that loads CSV data located at `self.path`
    into a Spark DataFrame.
    """

    def get_data_frame(self) -> DataFrame:
        """
        Loads the CSV data at `self.path` through the Spark 'csv' reader.

        The reader is configured with two options:
        - header=True: the first row is treated as column names.
        - inferSchema=True: Spark infers the column types from the data.

        Relies on a `spark` session object being available in the enclosing
        (notebook/global) scope; it is not defined in this class.

        Returns:
        DataFrame: The Spark DataFrame produced by the CSV reader.
        """
        # Configure the reader first, then trigger the load as a separate step
        csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)
        return csv_reader.load(self.path)

In [0]:


class DeltaDataSource(DataSource):
    """
    DataSource implementation intended for Delta tables.
    Here `self.path` is treated as a table name rather than a file path,
    and the table is loaded as a Spark DataFrame.
    """

    def get_data_frame(self) -> DataFrame:
        """
        Looks up the table named by `self.path` and returns it as a Spark DataFrame.

        The lookup goes through `spark.read.table`, so the table must be
        resolvable by name in the active Spark session. The code itself does
        not check that the table is stored in Delta format.

        Relies on a `spark` session object being available in the enclosing
        (notebook/global) scope; it is not defined in this class.

        Returns:
        DataFrame: The Spark DataFrame backed by the named table.
        """
        # self.path carries the table name for this source type
        return spark.read.table(self.path)


In [0]:
from typing import Union

def get_data(file_type: str, file_path: str) -> Union[CSVDataSource, ParquetDataSource, DeltaDataSource]:
    """
    Factory function that builds the DataSource matching the requested file type.

    The comparison is case-insensitive ('CSV' and 'csv' are equivalent).
    Only the DataSource object is constructed here; no data is read until
    its `get_data_frame` method is called.

    Parameters:
    file_type (str): The kind of source to read: 'csv', 'parquet', or 'delta'.
    file_path (str): The file path (csv/parquet) or table name (delta) to read from.

    Returns:
    DataSource: A CSVDataSource, ParquetDataSource, or DeltaDataSource wrapping `file_path`.

    Raises:
    ValueError: If `file_type` is not one of the supported types.
    """
    # Normalise once so each comparison below is a plain equality check
    normalized_type = file_type.lower()

    if normalized_type == "csv":
        return CSVDataSource(file_path)
    if normalized_type == "parquet":
        return ParquetDataSource(file_path)
    if normalized_type == "delta":
        return DeltaDataSource(file_path)

    # Nothing matched: report the value exactly as the caller supplied it
    raise ValueError(f"Method not implemented for datatype: {file_type}")
